From 43c5792592d9beb02eea57730ce5a4647dc0c838 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Thu, 27 Nov 2025 01:54:44 -0600 Subject: [PATCH 001/770] [ROCm][CI] Fix test_cpu_offloading for ROCm (#29548) Signed-off-by: Micah Williamson --- tests/v1/kv_offload/test_cpu_offloading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 406d4c0b4c1fd..57474a3dc01e7 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -20,6 +20,8 @@ ATTN_BACKENDS = ["FLASH_ATTN"] if current_platform.is_cuda(): ATTN_BACKENDS.append("FLASHINFER") +elif current_platform.is_rocm(): + ATTN_BACKENDS = ["TRITON_ATTN"] class MockSubscriber: From da3222f371b48c8e2548ec22767523394580a1c5 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 27 Nov 2025 00:09:41 -0800 Subject: [PATCH 002/770] [Model Runner V2] Implement multi-step Eagle with CUDA graph (#29559) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 9 +- vllm/v1/worker/gpu/model_runner.py | 53 +-- vllm/v1/worker/gpu/spec_decode/eagle.py | 422 ++++++++++++++++-- .../worker/gpu/spec_decode/eagle_cudagraph.py | 112 +++++ 4 files changed, 526 insertions(+), 70 deletions(-) create mode 100644 vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 8f1718e493b1e..4fd8eb50a4ea8 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -233,10 +233,11 @@ def prepare_inputs_to_capture( query_start_loc.np[num_reqs:] = num_tokens query_start_loc.copy_to_gpu() seq_lens_np = np.full(num_reqs, max_model_len, dtype=np.int32) - # HACK(woosuk): To optimize warmup time, we use 1 (instead of max_model_len) - # for seq_lens. This leads to a mismatch between seq_lens (GPU) and - # seq_lens_np (CPU), which might cause issues in some attention backends. 
- input_buffers.seq_lens[:num_reqs] = 1 + # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens + # rather than max_model_len. This introduces a discrepancy between + # seq_lens (on GPU) and seq_lens_np (on CPU), which may cause issues for + # certain attention backends. + input_buffers.seq_lens[:num_reqs] = num_tokens input_buffers.seq_lens[num_reqs:] = 0 input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables] diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index ed41e5a1a6c5e..0c9fdd0077f4a 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -140,10 +140,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) # CUDA graphs. - self.cudagraph_manager = CudaGraphManager( - vllm_config=self.vllm_config, - device=self.device, - ) + self.cudagraph_manager = CudaGraphManager(self.vllm_config, self.device) def get_supported_tasks(self) -> tuple[str]: return ("generate",) @@ -203,6 +200,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.vllm_config, self.device, ) + if self.do_spec_decode: + # HACK(woosuk) + self.speculator.set_attn( + self.kv_cache_config, + self.attn_metadata_builders, + self.block_tables, + ) + # TODO(woosuk): Support other backends. 
if not all(b.get_name() == "FLASH_ATTN" for b in self.attn_backends.values()): raise NotImplementedError("Only FLASH_ATTN backend is supported currently.") @@ -297,35 +302,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = self.model.compute_logits(hidden_states) self.sampler(logits, sampling_metadata) - @torch.inference_mode() - def _dummy_speculator_run( - self, - hidden_states: torch.Tensor, - aux_hidden_states: list[torch.Tensor] | None, - ) -> None: - num_tokens = hidden_states.shape[0] - num_reqs = min(num_tokens, self.max_num_reqs) - input_batch = InputBatch.make_dummy( - num_reqs=num_reqs, - num_tokens=num_tokens, - input_buffers=self.input_buffers, - device=self.device, - ) - sampling_metadata = SamplingMetadata.make_dummy( - num_reqs=num_reqs, - device=self.device, - ) - num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device) - num_rejected = torch.zeros(num_reqs, dtype=torch.int32, device=self.device) - self.propose_draft( - input_batch=input_batch, - sampling_metadata=sampling_metadata, - last_hidden_states=hidden_states, - aux_hidden_states=aux_hidden_states, - num_sampled=num_sampled, - num_rejected=num_rejected, - ) - @torch.inference_mode() def profile_run(self) -> None: hidden_states, sample_hidden_states = self._dummy_run( @@ -334,7 +310,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) self._dummy_sampler_run(sample_hidden_states) if self.do_spec_decode: - self._dummy_speculator_run(hidden_states, None) + num_tokens_across_dp = make_num_tokens_across_dp( + self.dp_size, self.max_num_tokens + ) + self.speculator.run_model( + self.max_num_tokens, + attn_metadata=None, + num_tokens_across_dp=num_tokens_across_dp, + ) torch.cuda.synchronize() del hidden_states, sample_hidden_states gc.collect() @@ -368,6 +351,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_metadata_builders=self.attn_metadata_builders, 
kv_cache_config=self.kv_cache_config, ) + if self.do_spec_decode: + self.speculator.capture_model() end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py index 3c8621cc69c97..daf2775e8b92d 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle.py @@ -1,17 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import numpy as np import torch import torch.nn as nn from vllm.config import VllmConfig from vllm.config.compilation import CUDAGraphMode from vllm.forward_context import set_forward_context +from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.triton_utils import tl, triton -from vllm.v1.worker.gpu.input_batch import InputBatch +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.worker.gpu.attn_utils import build_attn_metadata +from vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.input_batch import InputBatch, InputBuffers from vllm.v1.worker.gpu.sampler import gumbel_sample +from vllm.v1.worker.gpu.spec_decode.eagle_cudagraph import EagleCudaGraphManager from vllm.v1.worker.gpu.states import SamplingMetadata +logger = init_logger(__name__) + class EagleSpeculator: def __init__(self, vllm_config: VllmConfig, device: torch.device): @@ -27,13 +39,48 @@ class EagleSpeculator: self.scheduler_config = vllm_config.scheduler_config self.max_num_reqs = self.scheduler_config.max_num_seqs self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + self.max_model_len = vllm_config.model_config.max_model_len + # We need to get the hidden size from the draft model config because + # the draft 
model's hidden size can be different from the target model's + # hidden size (e.g., Llama 3.3 70B). + self.hidden_size = self.draft_model_config.get_hidden_size() + self.vocab_size = self.draft_model_config.get_vocab_size() + self.pin_memory = is_pin_memory_available() + self.dtype = vllm_config.model_config.dtype - self.input_ids = torch.zeros( - self.max_num_tokens, dtype=torch.int32, device=device + self.input_buffers = InputBuffers( + max_num_reqs=self.max_num_reqs, + max_num_tokens=self.max_num_tokens, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + dtype=self.dtype, + device=device, + pin_memory=self.pin_memory, ) - self.positions = torch.zeros( - self.max_num_tokens, dtype=torch.int64, device=device + self.hidden_states = torch.zeros( + self.max_num_tokens, + self.hidden_size, + dtype=self.dtype, + device=device, ) + self.temperature = torch.zeros( + self.max_num_reqs, + dtype=torch.float32, + device=device, + ) + self.seeds = torch.zeros( + self.max_num_reqs, + dtype=torch.int64, + device=device, + ) + self.draft_tokens = torch.zeros( + self.max_num_reqs, + self.num_speculative_steps, + dtype=torch.int64, + device=device, + ) + + self.cudagraph_manager = EagleCudaGraphManager(vllm_config, device) def load_model(self, target_model: nn.Module) -> None: from vllm.compilation.backends import set_model_tag @@ -49,6 +96,91 @@ class EagleSpeculator: del self.model.lm_head self.model.lm_head = target_model.lm_head + def set_attn( + self, + kv_cache_config: KVCacheConfig, + attn_metadata_builders: list[AttentionMetadataBuilder], + block_tables: BlockTables, + ) -> None: + self.kv_cache_config = kv_cache_config + self.attn_metadata_builders = attn_metadata_builders + self.block_tables = block_tables + + @torch.inference_mode() + def run_model( + self, + num_tokens: int, + attn_metadata: dict[str, Any], + num_tokens_across_dp: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + with set_forward_context( + attn_metadata, + self.vllm_config, 
+ num_tokens=num_tokens, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + num_tokens_across_dp=num_tokens_across_dp, + ): + ret_hidden_states = self.model( + input_ids=self.input_buffers.input_ids.gpu[:num_tokens], + positions=self.input_buffers.positions[:num_tokens], + hidden_states=self.hidden_states[:num_tokens], + ) + if self.method == "mtp": + last_hidden_states = ret_hidden_states + hidden_states = ret_hidden_states + else: + last_hidden_states, hidden_states = ret_hidden_states + return last_hidden_states, hidden_states + + def generate_draft( + self, + num_reqs: int, + attn_metadata: dict[str, Any], + num_tokens_across_dp: torch.Tensor | None, + ) -> None: + pos = self.input_buffers.positions[:num_reqs] + query_start_loc = self.input_buffers.query_start_loc.gpu[: num_reqs + 1] + for step in range(1, self.num_speculative_steps): + # Run the eagle model. + last_hidden_states, hidden_states = self.run_model( + num_reqs, attn_metadata, num_tokens_across_dp + ) + logits = self.model.compute_logits(last_hidden_states) + + # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise + # used for draft and target sampling. + draft_tokens = gumbel_sample( + logits, + self.temperature[:num_reqs], + self.seeds[:num_reqs], + pos + 1, + apply_temperature=True, + ) + self.draft_tokens[:num_reqs, step] = draft_tokens + + if step < self.num_speculative_steps - 1: + # Update the inputs for the next step. 
+ update_eagle_inputs( + draft_tokens, + hidden_states, + self.input_buffers, + self.hidden_states, + self.max_model_len, + ) + self.block_tables.compute_slot_mappings(query_start_loc, pos) + + def capture_model(self) -> None: + if self.num_speculative_steps == 1: + return + logger.info("Capturing model for Eagle speculator...") + self.cudagraph_manager.capture( + self.generate_draft, + self.input_buffers, + self.block_tables, + self.attn_metadata_builders, + self.kv_cache_config, + ) + @torch.inference_mode() def propose( self, @@ -80,64 +212,110 @@ class EagleSpeculator: ) else: hidden_states = last_hidden_states + num_tokens = input_batch.num_tokens_after_padding + self.hidden_states[:num_tokens] = hidden_states # Get the input ids and last token indices for the speculator. last_token_indices = prepare_eagle_inputs( - self.input_ids, + self.input_buffers, input_batch, num_sampled, num_rejected, last_sampled, next_prefill_tokens, ) - input_ids = self.input_ids[: input_batch.num_tokens_after_padding] # Prefill: Run the eagle speculator with eager mode. - with set_forward_context( + # TODO(woosuk): Support CUDA graph for prefill. 
+ last_hidden_states, hidden_states = self.run_model( + num_tokens, input_batch.attn_metadata, - self.vllm_config, - num_tokens=input_batch.num_tokens_after_padding, - cudagraph_runtime_mode=CUDAGraphMode.NONE, - ): - ret_hidden_states = self.model( - input_ids=input_ids, - positions=input_batch.positions, - hidden_states=hidden_states, - ) - if self.method == "mtp": - last_hidden_states = ret_hidden_states - hidden_states = ret_hidden_states - else: - last_hidden_states, hidden_states = ret_hidden_states + num_tokens_across_dp=None, # FIXME + ) sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states) num_reqs = input_batch.num_reqs cu_num_logits = input_batch.cu_num_logits[:num_reqs] - temperature = sampling_metadata.temperature[cu_num_logits] - seed = sampling_metadata.seeds[cu_num_logits] - # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise - # used for draft and target sampling. - pos = input_batch.positions[last_token_indices] + 1 # NOTE(woosuk): For draft sampling, we only consider the temperature # and ignore the other sampling parameters such as top_k and top_p, # for simplicity and performance. # While this may slightly degrade the acceptance rate, it does not # affect the output distribution after rejection sampling. + temperature = self.temperature[:num_reqs] + seeds = self.seeds[:num_reqs] + pos = self.input_buffers.positions[:num_reqs] + # Gather the values and copy them to the pre-allocated buffers. + torch.gather(sampling_metadata.temperature, 0, cu_num_logits, out=temperature) + torch.gather(sampling_metadata.seeds, 0, cu_num_logits, out=seeds) + torch.gather(input_batch.positions, 0, last_token_indices, out=pos) + # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise + # used for draft and target sampling. 
draft_tokens = gumbel_sample( - logits, temperature, seed, pos, apply_temperature=True + logits, temperature, seeds, pos + 1, apply_temperature=True ) if self.num_speculative_steps == 1: # Early exit. return draft_tokens.view(-1, 1) - raise NotImplementedError("num_speculative_steps > 1 is not supported yet.") + + # Save the draft tokens for the first step. + self.draft_tokens[:num_reqs, 0] = draft_tokens + # Prepare the inputs for the decode steps. + prepare_eagle_decode( + draft_tokens, + hidden_states, + last_token_indices, + input_batch.seq_lens, + num_rejected, + self.input_buffers, + self.hidden_states, + self.max_model_len, + self.max_num_reqs, + ) + query_start_loc = self.input_buffers.query_start_loc + query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1] + slot_mappings = self.block_tables.compute_slot_mappings( + query_start_loc_gpu, pos + ) + + cudagraph_size = self.cudagraph_manager.get_cudagraph_size(num_reqs) + if cudagraph_size is not None: + # Run CUDA graph. + self.cudagraph_manager.run(cudagraph_size) + return self.draft_tokens[:num_reqs] + + # Run eager mode. + query_start_loc.np[: num_reqs + 1] = np.arange(num_reqs + 1) + query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1] + # HACK(woosuk) + seq_lens_np = np.full(num_reqs, self.max_model_len, dtype=np.int32) + block_tables = [x[:num_reqs] for x in self.block_tables.input_block_tables] + + # FIXME(woosuk): This is UNSAFE!! 
+ attn_metadata = build_attn_metadata( + attn_metadata_builders=self.attn_metadata_builders, + num_reqs=num_reqs, + num_tokens=num_reqs, + query_start_loc_gpu=query_start_loc_gpu, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens=self.input_buffers.seq_lens[:num_reqs], + seq_lens_np=seq_lens_np, + num_computed_tokens_cpu=None, # FIXME + block_tables=block_tables, + slot_mappings=slot_mappings, + kv_cache_config=self.kv_cache_config, + ) + self.generate_draft(num_reqs, attn_metadata, num_tokens_across_dp=None) # FIXME + return self.draft_tokens[:num_reqs] @triton.jit def _prepare_eagle_inputs_kernel( last_token_indices_ptr, eagle_input_ids_ptr, + eagle_positions_ptr, target_input_ids_ptr, + target_positions_ptr, idx_mapping_ptr, last_sampled_ptr, next_prefill_tokens_ptr, @@ -175,9 +353,16 @@ def _prepare_eagle_inputs_kernel( tl.store(last_token_indices_ptr + batch_idx, last_token_index) tl.store(eagle_input_ids_ptr + last_token_index, next_token) + # Copy positions. + for i in range(0, query_len, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < query_len + target_pos = tl.load(target_positions_ptr + query_start + block, mask=mask) + tl.store(eagle_positions_ptr + query_start + block, target_pos, mask=mask) + def prepare_eagle_inputs( - eagle_input_ids: torch.Tensor, + input_buffers: InputBuffers, input_batch: InputBatch, # [num_reqs] num_sampled: torch.Tensor, @@ -192,12 +377,14 @@ def prepare_eagle_inputs( last_token_indices = torch.empty( num_reqs, dtype=torch.int64, - device=eagle_input_ids.device, + device=num_sampled.device, ) _prepare_eagle_inputs_kernel[(num_reqs,)]( last_token_indices, - eagle_input_ids, + input_buffers.input_ids.gpu, + input_buffers.positions, input_batch.input_ids, + input_batch.positions, input_batch.idx_mapping, last_sampled, next_prefill_tokens, @@ -207,3 +394,174 @@ def prepare_eagle_inputs( BLOCK_SIZE=1024, ) return last_token_indices + + +@triton.jit +def _prepare_eagle_docode_kernel( + draft_tokens_ptr, + 
output_hidden_states_ptr, + output_hidden_states_stride, + last_token_indices_ptr, + target_seq_lens_ptr, + num_rejected_ptr, + input_ids_ptr, + positions_ptr, + input_hidden_states_ptr, + input_hidden_states_stride, + query_start_loc_ptr, + seq_lens_ptr, + hidden_size, + max_model_len, + max_num_reqs, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + num_reqs = tl.num_programs(0) - 1 + if req_idx == num_reqs: + # Compute query_start_loc. Pad it with the last query_start_loc + # for CUDA graphs. + for i in range(0, max_num_reqs + 1, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + q = tl.where(block < num_reqs, block, num_reqs) + mask = block < max_num_reqs + 1 + tl.store(query_start_loc_ptr + block, q, mask=mask) + # Pad seq_lens for CUDA graphs. + for i in range(req_idx, max_num_reqs, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < max_num_reqs + tl.store(seq_lens_ptr + block, 0, mask=mask) + return + + # draft token -> input id. + draft_token = tl.load(draft_tokens_ptr + req_idx) + tl.store(input_ids_ptr + req_idx, draft_token) + + # output hidden states -> input hidden states. + src_idx = tl.load(last_token_indices_ptr + req_idx) + for i in range(0, hidden_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < hidden_size + output_hidden_states = tl.load( + output_hidden_states_ptr + src_idx * output_hidden_states_stride + block, + mask=mask, + ) + tl.store( + input_hidden_states_ptr + req_idx * input_hidden_states_stride + block, + output_hidden_states, + mask=mask, + ) + + # Compute position and seq_lens. + # NOTE(woosuk): To prevent out-of-range access, we clamp these values + # if they reach the max model length. 
+ position = tl.load(positions_ptr + req_idx) + position = tl.minimum(position + 1, max_model_len - 1) + tl.store(positions_ptr + req_idx, position) + + target_seq_len = tl.load(target_seq_lens_ptr + req_idx) + num_rejected = tl.load(num_rejected_ptr + req_idx) + seq_len = target_seq_len - num_rejected + seq_len = tl.minimum(seq_len + 1, max_model_len) + tl.store(seq_lens_ptr + req_idx, seq_len) + + +def prepare_eagle_decode( + draft_tokens: torch.Tensor, + output_hidden_states: torch.Tensor, + last_token_indices: torch.Tensor, + target_seq_lens: torch.Tensor, + num_rejected: torch.Tensor, + input_buffers: InputBuffers, + input_hidden_states: torch.Tensor, + max_model_len: int, + max_num_reqs: int, +): + num_reqs = draft_tokens.shape[0] + hidden_size = output_hidden_states.shape[-1] + _prepare_eagle_docode_kernel[(num_reqs + 1,)]( + draft_tokens, + output_hidden_states, + output_hidden_states.stride(0), + last_token_indices, + target_seq_lens, + num_rejected, + input_buffers.input_ids.gpu, + input_buffers.positions, + input_hidden_states, + input_hidden_states.stride(0), + input_buffers.query_start_loc.gpu, + input_buffers.seq_lens, + hidden_size, + max_model_len, + max_num_reqs, + BLOCK_SIZE=1024, + ) + + +@triton.jit +def _update_eagle_inputs_kernel( + input_ids_ptr, + positions_ptr, + input_hidden_states_ptr, + input_hidden_states_stride, + seq_lens_ptr, + max_model_len, + draft_tokens_ptr, + output_hidden_states_ptr, + output_hidden_states_stride, + hidden_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + + # Draft token -> Input ID. + draft_token = tl.load(draft_tokens_ptr + req_idx) + tl.store(input_ids_ptr + req_idx, draft_token) + + # Output hidden states -> Input hidden states. 
+ for i in range(0, hidden_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < hidden_size + output_hidden_states = tl.load( + output_hidden_states_ptr + req_idx * output_hidden_states_stride + block, + mask=mask, + ) + tl.store( + input_hidden_states_ptr + req_idx * input_hidden_states_stride + block, + output_hidden_states, + mask=mask, + ) + + # Increment position and seq_lens. + # NOTE(woosuk): To prevent out-of-range access, we clamp these values + # if they reach the max model length. + position = tl.load(positions_ptr + req_idx) + position = tl.minimum(position + 1, max_model_len - 1) + tl.store(positions_ptr + req_idx, position) + + seq_len = tl.load(seq_lens_ptr + req_idx) + seq_len = tl.minimum(seq_len + 1, max_model_len) + tl.store(seq_lens_ptr + req_idx, seq_len) + + +def update_eagle_inputs( + draft_tokens: torch.Tensor, + output_hidden_states: torch.Tensor, + input_buffers: InputBuffers, + hidden_states: torch.Tensor, + max_model_len: int, +): + num_reqs, hidden_size = output_hidden_states.shape + _update_eagle_inputs_kernel[(num_reqs,)]( + input_buffers.input_ids.gpu, + input_buffers.positions, + hidden_states, + hidden_states.stride(0), + input_buffers.seq_lens, + max_model_len, + draft_tokens, + output_hidden_states, + output_hidden_states.stride(0), + hidden_size, + BLOCK_SIZE=1024, + ) diff --git a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py new file mode 100644 index 0000000000000..a6f50d68cc684 --- /dev/null +++ b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch + +from vllm.config import VllmConfig +from vllm.config.compilation import CUDAGraphMode +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +from vllm.v1.kv_cache_interface import KVCacheConfig +from 
vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.cudagraph_utils import ( + capture_graphs, + get_cudagraph_sizes, + prepare_inputs_to_capture, +) +from vllm.v1.worker.gpu.dp_utils import make_num_tokens_across_dp +from vllm.v1.worker.gpu.input_batch import InputBuffers + + +class EagleCudaGraphManager: + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.scheduler_config = vllm_config.scheduler_config + self.device = device + + self.max_model_len = vllm_config.model_config.max_model_len + self.max_num_reqs = self.scheduler_config.max_num_seqs + self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + self.dp_size = vllm_config.parallel_config.data_parallel_size + self.compilation_config = vllm_config.compilation_config + assert self.compilation_config is not None + + if self.compilation_config.cudagraph_mode is None: + self.cudagraph_mode = CUDAGraphMode.NONE + else: + self.cudagraph_mode = self.compilation_config.cudagraph_mode + if self.cudagraph_mode == CUDAGraphMode.FULL: + # NOTE(woosuk): For Eagle, we only use CUDA graphs for decode. 
+ self.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + + self.cudagraph_sizes = get_cudagraph_sizes( + self.compilation_config.cudagraph_capture_sizes, + self.max_num_reqs, + self.max_num_tokens, + self.cudagraph_mode, + ) + + self.graphs: dict[int, torch.cuda.CUDAGraph] = {} + self.pool = torch.cuda.graph_pool_handle() + + def get_cudagraph_size(self, num_tokens: int) -> int | None: + return self.cudagraph_sizes.get(num_tokens) + + def capture_graph( + self, + num_tokens: int, + generate_fn: Callable, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + kv_cache_config: KVCacheConfig, + ) -> None: + num_reqs = min(num_tokens, self.max_num_reqs) + attn_metadata = prepare_inputs_to_capture( + num_reqs, + num_tokens, + input_buffers, + block_tables, + attn_metadata_builders, + self.max_model_len, + kv_cache_config, + ) + num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens) + + # Warm up. + generate_fn(num_tokens, attn_metadata, num_tokens_across_dp) + + # Capture the graph. 
+ assert num_tokens not in self.graphs + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, self.pool): + generate_fn(num_tokens, attn_metadata, num_tokens_across_dp) + self.graphs[num_tokens] = graph + + @torch.inference_mode() + def capture( + self, + generate_fn: Callable, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + kv_cache_config: KVCacheConfig, + ) -> None: + capture_graphs( + self.cudagraph_sizes, + self.device, + self.capture_graph, + generate_fn=generate_fn, + input_buffers=input_buffers, + block_tables=block_tables, + attn_metadata_builders=attn_metadata_builders, + kv_cache_config=kv_cache_config, + ) + + def run(self, num_tokens: int) -> None: + assert num_tokens in self.graphs + self.graphs[num_tokens].replay() From 00d3310d2d00d021d2e8f5f00e31b51d30f0413e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 27 Nov 2025 17:36:18 +0800 Subject: [PATCH 003/770] [Bugfix] Update Ultravox compatibility (#29588) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/ultravox.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index bb0f6bd036f14..26a8355cd22b5 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -116,7 +116,12 @@ class UltravoxProcessingInfo(BaseProcessingInfo): def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor: hf_processor = self.get_hf_processor(**kwargs) + + # Changed in https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e audio_processor = hf_processor.audio_processor # type: ignore + if isinstance(audio_processor, WhisperFeatureExtractor): + return audio_processor + feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor From 
0838b52e2eff77d1aaf4ee9d0da19522b9a5749c Mon Sep 17 00:00:00 2001 From: Morrison Turnansky Date: Thu, 27 Nov 2025 04:55:58 -0500 Subject: [PATCH 004/770] [Frontend][torch.compile] CompilationConfig Overhaul (#20283): Set up -O infrastructure (#26847) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: morrison-turnansky Signed-off-by: adabeyta Signed-off-by: Morrison Turnansky Co-authored-by: adabeyta Co-authored-by: Luka Govedič Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/design/optimization_levels.md | 69 ++++ tests/compile/test_config.py | 4 +- tests/engine/test_arg_utils.py | 57 +++- .../model_executor/test_enabled_custom_ops.py | 8 +- tests/test_config.py | 307 +++++++++++++++++- tests/utils_/test_argparse_utils.py | 10 +- tests/v1/cudagraph/test_cudagraph_mode.py | 4 +- vllm/config/compilation.py | 75 ++++- vllm/config/model.py | 8 + vllm/config/vllm.py | 223 ++++++++++++- vllm/engine/arg_utils.py | 8 +- vllm/utils/argparse_utils.py | 24 +- vllm/v1/worker/gpu/cudagraph_utils.py | 2 +- 13 files changed, 735 insertions(+), 64 deletions(-) create mode 100644 docs/design/optimization_levels.md diff --git a/docs/design/optimization_levels.md b/docs/design/optimization_levels.md new file mode 100644 index 0000000000000..940286071ef3c --- /dev/null +++ b/docs/design/optimization_levels.md @@ -0,0 +1,69 @@ + + +# Optimization Levels + +## Overview + +vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechnaism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten. 
+
+## Level Summaries and Usage Examples
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=0
+)
+```
+
+#### `-O1`: Quick Optimizations
+- **Startup**: Moderate startup time
+- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
+- **Use case**: Balance for most development scenarios
+
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=1
+)
+```
+
+#### `-O2`: Full Optimizations (Default)
+- **Startup**: Longer startup time
+- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
+- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
+
+```bash
+# CLI usage (default, so optional)
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=2 # This is the default
+)
+```
+
+#### `-O3`: Full Optimization
+Still in development. Added infrastructure to prevent changing API in future
+release. Currently behaves the same as O2.
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
+2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
+3.
**Performance Issues**: Ensure using `-O2` for production \ No newline at end of file diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 1e8a882a7f3eb..a9e5ccee520e3 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -172,8 +172,8 @@ def test_splitting_ops_dynamic(): config = VllmConfig() # Default V1 config leaves cudagraph mode unset; splitting ops are only # populated when the engine decides to use piecewise compilation. - assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE - assert not config.compilation_config.splitting_ops_contain_attention() + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE + assert config.compilation_config.splitting_ops_contain_attention() # When use_inductor_graph_partition=True config = VllmConfig( diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index be926764e4948..0077609b2f365 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -222,6 +222,47 @@ def test_media_io_kwargs_parser(arg, expected): assert args.media_io_kwargs == expected +@pytest.mark.parametrize( + ("args", "expected"), + [ + (["-O", "1"], "1"), + (["-O", "2"], "2"), + (["-O", "3"], "3"), + (["-O0"], "0"), + (["-O1"], "1"), + (["-O2"], "2"), + (["-O3"], "3"), + ], +) +def test_optimization_level(args, expected): + """ + Test space-separated optimization levels (-O 1, -O 2, -O 3) map to + optimization_level. + """ + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + parsed_args = parser.parse_args(args) + assert parsed_args.optimization_level == expected + assert parsed_args.compilation_config.mode is None + + +@pytest.mark.parametrize( + ("args", "expected"), + [ + (["-O.mode=0"], 0), + (["-O.mode=1"], 1), + (["-O.mode=2"], 2), + (["-O.mode=3"], 3), + ], +) +def test_mode_parser(args, expected): + """ + Test compilation config modes (-O.mode=int) map to compilation_config. 
+ """ + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + parsed_args = parser.parse_args(args) + assert parsed_args.compilation_config.mode == expected + + def test_compilation_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) @@ -229,22 +270,6 @@ def test_compilation_config(): args = parser.parse_args([]) assert args.compilation_config == CompilationConfig() - # set to O3 - args = parser.parse_args(["-O0"]) - assert args.compilation_config.mode == 0 - - # set to O 3 (space) - args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.mode == 1 - - # set to O 3 (equals) - args = parser.parse_args(["-O=2"]) - assert args.compilation_config.mode == 2 - - # set to O.mode 3 - args = parser.parse_args(["-O.mode", "3"]) - assert args.compilation_config.mode == 3 - # set to string form of a dict args = parser.parse_args( [ diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 9121284de85b7..7d95dcddca711 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -5,7 +5,12 @@ import pytest import torch from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config +from vllm.config import ( + CompilationConfig, + VllmConfig, + get_cached_compilation_config, + set_current_vllm_config, +) from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import ( GeluAndMul, @@ -86,6 +91,7 @@ def test_enabled_ops( backend=backend, mode=compilation_mode, custom_ops=custom_ops ) ) + get_cached_compilation_config.cache_clear() with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on diff --git a/tests/test_config.py b/tests/test_config.py index 16f68d18fc68b..080e4d2afacc6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,9 +8,20 @@ from unittest.mock import patch import pytest from 
vllm.compilation.backends import VllmBackend -from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config +from vllm.config import ( + CompilationConfig, + ModelConfig, + PoolerConfig, + VllmConfig, + update_config, +) +from vllm.config.compilation import CompilationMode, CUDAGraphMode from vllm.config.load import LoadConfig from vllm.config.utils import get_field +from vllm.config.vllm import ( + OPTIMIZATION_LEVEL_TO_CONFIG, + OptimizationLevel, +) from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -235,6 +246,43 @@ def test_default_pooling_type(model_id, default_pooling_type, pooling_type): assert model_config.pooler_config.pooling_type == pooling_type +@pytest.mark.parametrize( + ("model_id", "expected_is_moe_model"), + [ + ("RedHatAI/Qwen3-8B-speculator.eagle3", False), + ("RedHatAI/Llama-3.1-8B-Instruct-NVFP4", False), + ("RedHatAI/Llama-3.2-1B-FP8", False), + ("RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8", False), + ("RedHatAI/gpt-oss-20b", True), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", True), + ("RedHatAI/Llama-4-Scout-17B-16E-Instruct", True), + ("RedHatAI/Mixtral-8x7B-Instruct-v0.1", True), + ], +) +def test_moe_model_detection(model_id, expected_is_moe_model): + model_config = ModelConfig(model_id) + # Just check that is_moe_model field exists and is a boolean + assert model_config.is_model_moe() == expected_is_moe_model + + +@pytest.mark.parametrize( + ("model_id", "quantized"), + [ + ("RedHatAI/Qwen3-8B-speculator.eagle3", False), + ("RedHatAI/Llama-3.1-8B-Instruct-NVFP4", True), + ("RedHatAI/Llama-3.2-1B-FP8", True), + ("RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8", True), + ("RedHatAI/gpt-oss-20b", True), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", True), + ("RedHatAI/Mixtral-8x7B-Instruct-v0.1", False), + ], +) +def test_is_quantized(model_id, quantized): + model_config = ModelConfig(model_id) + # Just check that quantized field exists and is a boolean + assert 
model_config.is_quantized() == quantized + + @pytest.mark.skipif( current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." ) @@ -552,3 +600,260 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) assert os.path.exists(config2.model) and os.path.isdir(config2.model) assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) + + +@pytest.mark.parametrize( + ("backend", "custom_ops", "expected"), + [ + ("eager", [], True), + ("eager", ["+fused_layernorm"], True), + ("eager", ["all", "-fused_layernorm"], False), + ("inductor", [], False), + ("inductor", ["none", "+fused_layernorm"], True), + ("inductor", ["none", "-fused_layernorm"], False), + ], +) +def test_is_custom_op_enabled(backend: str, custom_ops: list[str], expected: bool): + """Test that is_custom_op_enabled works correctly.""" + config = VllmConfig( + compilation_config=CompilationConfig(backend=backend, custom_ops=custom_ops) + ) + assert config.compilation_config.is_custom_op_enabled("fused_layernorm") is expected + + +def test_vllm_config_defaults_are_none(): + """Verify that optimization-level defaults are None when not set by user.""" + # Test all optimization levels to ensure defaults work correctly + for opt_level in OptimizationLevel: + config = object.__new__(VllmConfig) + config.compilation_config = CompilationConfig() + config.optimization_level = opt_level + config.model_config = None + + # Use the global optimization level defaults + default_config = OPTIMIZATION_LEVEL_TO_CONFIG[opt_level] + + # Verify that all pass_config values are None before defaults are applied + for pass_k in default_config["compilation_config"]["pass_config"]: + assert getattr(config.compilation_config.pass_config, pass_k) is None + + # Verify that other config values are None before defaults are applied + for k in default_config["compilation_config"]: + if k != 
"pass_config": + assert getattr(config.compilation_config, k) is None + + +@pytest.mark.parametrize( + ("model_id", "compiliation_config", "optimization_level"), + [ + ( + None, + CompilationConfig(backend="eager", custom_ops=["+quant_fp8"]), + OptimizationLevel.O0, + ), + (None, CompilationConfig(), OptimizationLevel.O0), + (None, CompilationConfig(), OptimizationLevel.O1), + (None, CompilationConfig(), OptimizationLevel.O2), + (None, CompilationConfig(), OptimizationLevel.O3), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(backend="inductor", custom_ops=["+quant_fp8"]), + OptimizationLevel.O2, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O0, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O1, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O2, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O3, + ), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O0), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O1), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O2), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O3), + ], +) +def test_vllm_config_defaults(model_id, compiliation_config, optimization_level): + """Test that optimization-level defaults are correctly applied.""" + + model_config = None + if model_id is not None: + model_config = ModelConfig(model_id) + vllm_config = VllmConfig( + model_config=model_config, + compilation_config=compiliation_config, + optimization_level=optimization_level, + ) + else: + vllm_config = VllmConfig( + compilation_config=compiliation_config, + optimization_level=optimization_level, + ) + # Use the global optimization level defaults + default_config = OPTIMIZATION_LEVEL_TO_CONFIG[optimization_level] + + # Verify pass_config 
defaults (nested under compilation_config) + pass_config_dict = default_config["compilation_config"]["pass_config"] + for pass_k, pass_v in pass_config_dict.items(): + actual = getattr(vllm_config.compilation_config.pass_config, pass_k) + expected = pass_v(vllm_config) if callable(pass_v) else pass_v + assert actual == expected, ( + f"pass_config.{pass_k}: expected {expected}, got {actual}" + ) + + # Verify other compilation_config defaults + compilation_config_dict = default_config["compilation_config"] + for k, v in compilation_config_dict.items(): + if k != "pass_config": + actual = getattr(vllm_config.compilation_config, k) + expected = v(vllm_config) if callable(v) else v + assert actual == expected, ( + f"compilation_config.{k}: expected {expected}, got {actual}" + ) + + +def test_vllm_config_callable_defaults(): + """Test that callable defaults work in the config system. + + Verifies that lambdas in default configs can inspect VllmConfig properties + (e.g., is_quantized, is_model_moe) to conditionally set optimization flags. 
+ """ + config_no_model = VllmConfig(optimization_level=OptimizationLevel.O2) + + # Callable that checks if model exists + has_model = lambda cfg: cfg.model_config is not None + assert has_model(config_no_model) is False + + # Test with quantized model + quantized_model = ModelConfig("RedHatAI/Llama-3.2-1B-FP8") + config_quantized = VllmConfig( + model_config=quantized_model, optimization_level=OptimizationLevel.O2 + ) + enable_if_quantized = lambda cfg: ( + cfg.model_config is not None and cfg.model_config.is_quantized() + ) + assert enable_if_quantized(config_quantized) is True + assert enable_if_quantized(config_no_model) is False + + # Test with MoE model + moe_model = ModelConfig("deepseek-ai/DeepSeek-V2-Lite") + config_moe = VllmConfig( + model_config=moe_model, optimization_level=OptimizationLevel.O2 + ) + enable_if_sequential = lambda cfg: ( + cfg.model_config is not None and not cfg.model_config.is_model_moe() + ) + assert enable_if_sequential(config_moe) is False + assert enable_if_sequential(config_quantized) is True + + +def test_vllm_config_explicit_overrides(): + """Test that explicit property overrides work correctly with callable defaults. + + When users explicitly set configuration properties, those values + take precedence over callable defaults, across different models and + optimization levels. 
+ """ + from vllm.config.compilation import PassConfig + + quantized_model = ModelConfig("RedHatAI/Llama-3.2-1B-FP8") + moe_model = ModelConfig("deepseek-ai/DeepSeek-V2-Lite") + regular_model = ModelConfig("Qwen/Qwen1.5-7B") + + # Explicit compilation mode override on O0 (where default is NONE) + compilation_config = CompilationConfig(mode=CompilationMode.VLLM_COMPILE) + config = VllmConfig( + optimization_level=OptimizationLevel.O0, + compilation_config=compilation_config, + ) + assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + + # Explicit pass config flags to override defaults + pass_config = PassConfig(enable_noop=True, enable_attn_fusion=True) + compilation_config = CompilationConfig(pass_config=pass_config) + config = VllmConfig( + optimization_level=OptimizationLevel.O0, + compilation_config=compilation_config, + ) + assert config.compilation_config.pass_config.enable_noop is True + assert config.compilation_config.pass_config.enable_attn_fusion is True + + # Explicit cudagraph mode override on quantized model at O2 + pass_config = PassConfig(enable_async_tp=True) + compilation_config = CompilationConfig( + cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config + ) + config = VllmConfig( + model_config=quantized_model, + optimization_level=OptimizationLevel.O2, + compilation_config=compilation_config, + ) + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + assert config.compilation_config.pass_config.enable_async_tp is True + # Mode should still use default for O2 + assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE + + # Different optimization levels with same model + config_o0 = VllmConfig( + model_config=regular_model, optimization_level=OptimizationLevel.O0 + ) + config_o2 = VllmConfig( + model_config=regular_model, optimization_level=OptimizationLevel.O2 + ) + assert config_o0.compilation_config.mode == 
CompilationMode.NONE + assert config_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config_o0.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + assert ( + config_o2.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE + ) + + # Same optimization level across different model types + config_moe_o2 = VllmConfig( + model_config=moe_model, optimization_level=OptimizationLevel.O2 + ) + config_regular_o2 = VllmConfig( + model_config=regular_model, optimization_level=OptimizationLevel.O2 + ) + config_quantized_o2 = VllmConfig( + model_config=quantized_model, optimization_level=OptimizationLevel.O2 + ) + # All should have same base compilation settings at O2 + assert config_moe_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config_regular_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config_quantized_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert ( + config_moe_o2.compilation_config.cudagraph_mode + == CUDAGraphMode.FULL_AND_PIECEWISE + ) + assert ( + config_regular_o2.compilation_config.cudagraph_mode + == CUDAGraphMode.FULL_AND_PIECEWISE + ) + + # Override one field but not others + pass_config = PassConfig(enable_noop=False) + compilation_config = CompilationConfig(pass_config=pass_config) + config = VllmConfig( + model_config=regular_model, + optimization_level=OptimizationLevel.O2, + compilation_config=compilation_config, + ) + # Explicit override should be respected + assert config.compilation_config.pass_config.enable_noop is False + # Other fields should still use defaults + assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index 32d4eca541356..c0519155c4ba8 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -28,6 +28,7 @@ def 
parser(): parser.add_argument("--enable-feature", action="store_true") parser.add_argument("--hf-overrides", type=json.loads) parser.add_argument("-O", "--compilation-config", type=json.loads) + parser.add_argument("--optimization-level", type=int) return parser @@ -217,8 +218,8 @@ def test_dict_args(parser): "key15": "-minus.and.dot", }, } + assert parsed_args.optimization_level == 1 assert parsed_args.compilation_config == { - "mode": 1, "use_inductor_graph_partition": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -241,12 +242,13 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"mode": 3} + assert parsed_args.optimization_level == 3 + assert parsed_args.compilation_config == {"mode": 2} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.mode" in caplog_vllm.text + assert "--optimization-level" in caplog_vllm.text def test_model_specification( @@ -383,7 +385,7 @@ def test_compilation_mode_string_values(parser): assert args.compilation_config == {"mode": 0} args = parser.parse_args(["-O3"]) - assert args.compilation_config == {"mode": 3} + assert args.optimization_level == 3 args = parser.parse_args(["-O.mode=NONE"]) assert args.compilation_config == {"mode": "NONE"} diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 7f9c2a0571c3c..12621d493e549 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -117,9 +117,9 @@ else: combo_cases_2 = [ ("FA2", "FULL", CompilationMode.NONE, True), ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), - ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.NONE, True), ("FA2", "PIECEWISE", 
CompilationMode.VLLM_COMPILE, True), - ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, True), ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 865d045676d14..da2c100dae3dc 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -8,7 +8,7 @@ from dataclasses import asdict, field from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal -from pydantic import TypeAdapter, field_validator +from pydantic import Field, TypeAdapter, field_validator from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -97,19 +97,25 @@ class PassConfig: This is separate from general `CompilationConfig` so that inductor passes don't all have access to full configuration - that would create a cycle as - the `PassManager` is set as a property of config.""" + the `PassManager` is set as a property of config. - enable_fusion: bool = False + You must pass PassConfig to VLLMConfig constructor via the CompilationConfig + constructor. VLLMConfig's post_init does further initialization. + If used outside of the VLLMConfig, some fields may be left in an + improper state. 
+ """ + + enable_fusion: bool = Field(default=None) """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" - enable_attn_fusion: bool = False + enable_attn_fusion: bool = Field(default=None) """Whether to enable the custom attention+quant fusion pass.""" - enable_noop: bool = False + enable_noop: bool = Field(default=None) """Whether to enable the custom no-op elimination pass.""" - enable_sequence_parallelism: bool = False + enable_sequence_parallelism: bool = Field(default=None) """Whether to enable sequence parallelism.""" - enable_async_tp: bool = False + enable_async_tp: bool = Field(default=None) """Whether to enable async TP.""" - enable_fi_allreduce_fusion: bool = False + enable_fi_allreduce_fusion: bool = Field(default=None) """Whether to enable flashinfer allreduce fusion.""" fi_allreduce_fusion_max_size_mb: float | None = None """The threshold of the communicated tensor sizes under which @@ -167,6 +173,22 @@ class PassConfig: """ return InductorPass.hash_dict(asdict(self)) + @field_validator( + "enable_fusion", + "enable_attn_fusion", + "enable_noop", + "enable_sequence_parallelism", + "enable_async_tp", + "enable_fi_allreduce_fusion", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + def __post_init__(self) -> None: if not self.enable_noop: if self.enable_fusion: @@ -243,7 +265,13 @@ class DynamicShapesConfig: @config @dataclass class CompilationConfig: - """Configuration for compilation. It has three parts: + """Configuration for compilation. + + You must pass CompilationConfig to VLLMConfig constructor. + VLLMConfig's post_init does further initialization. If used outside of the + VLLMConfig, some fields will be left in an improper state. 
+ + It has three parts: - Top-level Compilation control: - [`mode`][vllm.config.CompilationConfig.mode] @@ -282,14 +310,14 @@ class CompilationConfig: """ # Top-level Compilation control - level: int | None = None + level: int = Field(default=None) """ Level is deprecated and will be removed in the next release, either 0.12.0 or 0.11.2 whichever is soonest. Please use mode. Currently all levels are mapped to mode. """ # Top-level Compilation control - mode: CompilationMode | None = None + mode: CompilationMode = Field(default=None) """The compilation approach used for torch.compile-based compilation of the model. @@ -390,7 +418,7 @@ class CompilationConfig: constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`.""" # CudaGraph compilation - cudagraph_mode: CUDAGraphMode | None = None + cudagraph_mode: CUDAGraphMode = Field(default=None) """ The mode of the cudagraph: @@ -452,7 +480,7 @@ class CompilationConfig: When `enable_lora` is False, this option has no effect. """ - use_inductor_graph_partition: bool = False + use_inductor_graph_partition: bool = Field(default=None) """Use inductor graph partition to split the graph at cudagraph_unsafe ops. This partition happens at inductor codegen time after all passes and fusions are finished. 
It generates a single `call` function which wraps @@ -648,6 +676,20 @@ class CompilationConfig: ) return value + @field_validator( + "level", + "mode", + "cudagraph_mode", + "use_inductor_graph_partition", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + def __post_init__(self) -> None: if self.level is not None: logger.warning( @@ -948,6 +990,13 @@ class CompilationConfig: op, ) + def is_custom_op_enabled(self, op: str) -> bool: + if "all" in self.custom_ops: + return f"-{op}" not in self.custom_ops + + assert "none" in self.custom_ops + return f"+{op}" in self.custom_ops + def adjust_cudagraph_sizes_for_spec_decode( self, uniform_decode_query_len: int, tensor_parallel_size: int ): diff --git a/vllm/config/model.py b/vllm/config/model.py index 25972f097f53d..84311596b660c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1752,6 +1752,14 @@ class ModelConfig: logger.info("Using max model len %s", max_model_len) return max_model_len + def is_model_moe( + self, + ) -> bool: + return self.get_num_experts() > 1 + + def is_quantized(self) -> bool: + return getattr(self.hf_config, "quantization_config", None) is not None + def get_served_model_name(model: str, served_model_name: str | list[str] | None): """ diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 9342564aa3d3f..c576275e80fe3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -9,8 +9,9 @@ import tempfile import threading import time from contextlib import contextmanager -from dataclasses import replace +from dataclasses import is_dataclass, replace from datetime import datetime +from enum import IntEnum from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar, get_args @@ -57,6 +58,103 @@ else: logger = init_logger(__name__) +class 
OptimizationLevel(IntEnum): + """Optimization level enum.""" + + O0 = 0 + """O0 : No optimization. no compilation, no cudagraphs, no other + optimization, just starting up immediately""" + O1 = 1 + """O1: Quick optimizations. Dynamo+Inductor compilation and Piecewise + cudagraphs""" + O2 = 2 + """O2: Full optimizations. -O1 as well as Full and Piecewise cudagraphs.""" + O3 = 3 + """O3: Currently the same as -O2.""" + + +IS_QUANTIZED = False +IS_DENSE = False +# The optimizations that depend on these properties are currently set to False +# in all cases. +# if model_config is not None: +# IS_QUANTIZED = lambda c: c.model_config.is_quantized() +# IS_DENSE = lambda c: not c.model_config.is_model_moe() +# See https://github.com/vllm-project/vllm/issues/25689. + + +def enable_fusion(cfg: "VllmConfig") -> bool: + """Returns True if RMS norm or quant FP8 is enabled.""" + return cfg.compilation_config.is_custom_op_enabled( + "rms_norm" + ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8") + + +OPTIMIZATION_LEVEL_00 = { + "compilation_config": { + "pass_config": { + "enable_noop": False, + "enable_fusion": False, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": False, + "enable_sequence_parallelism": False, + "enable_async_tp": False, + }, + "cudagraph_mode": CUDAGraphMode.NONE, + "use_inductor_graph_partition": False, + }, +} +OPTIMIZATION_LEVEL_01 = { + "compilation_config": { + "pass_config": { + "enable_noop": True, + "enable_fusion": enable_fusion, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": False, + "enable_sequence_parallelism": False, + "enable_async_tp": False, + }, + "cudagraph_mode": CUDAGraphMode.PIECEWISE, + "use_inductor_graph_partition": False, + }, +} +OPTIMIZATION_LEVEL_02 = { + "compilation_config": { + "pass_config": { + "enable_noop": True, + "enable_fusion": enable_fusion, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": IS_QUANTIZED, + "enable_sequence_parallelism": IS_DENSE, + "enable_async_tp": 
IS_DENSE, + }, + "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, + "use_inductor_graph_partition": False, + }, +} +OPTIMIZATION_LEVEL_03 = { + "compilation_config": { + "pass_config": { + "enable_noop": True, + "enable_fusion": enable_fusion, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": IS_QUANTIZED, + "enable_sequence_parallelism": IS_DENSE, + "enable_async_tp": IS_DENSE, + }, + "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, + "use_inductor_graph_partition": False, + }, +} + +OPTIMIZATION_LEVEL_TO_CONFIG = { + OptimizationLevel.O0: OPTIMIZATION_LEVEL_00, + OptimizationLevel.O1: OPTIMIZATION_LEVEL_01, + OptimizationLevel.O2: OPTIMIZATION_LEVEL_02, + OptimizationLevel.O3: OPTIMIZATION_LEVEL_03, +} + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: @@ -116,6 +214,11 @@ class VllmConfig: you are using. Contents must be hashable.""" instance_id: str = "" """The ID of the vLLM instance.""" + optimization_level: OptimizationLevel = OptimizationLevel.O2 + """The optimization level. These levels trade startup time cost for + performance, with -O0 having the best startup time and -O3 having the best + performance. -O2 is used by default. See OptimizationLevel for full + description.""" def compute_hash(self) -> str: """ @@ -297,6 +400,50 @@ class VllmConfig: return replace(self, model_config=model_config) + def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None: + """Set config attribute to default if not already set by user. + + Args: + config_obj: Configuration object to update. + key: Attribute name. + value: Default value (static or callable). + """ + if getattr(config_obj, key) is None: + # Some config values are known before initialization and are + # hard coded. + # Other values depend on the user given configuration, so they are + # implemented with lambda functions and decided at run time.
+ setattr(config_obj, key, value(self) if callable(value) else value) + + def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None: + """Apply optimization level defaults using self as root. + + Recursively applies values from defaults into nested config objects. + Only fields present in defaults are overwritten. + + If the user configuration does not specify a value for a default field + and if the default field is still None after all user selections are + applied, then default values will be applied to the field. User specified + fields will not be overridden by the default. + + Args: + defaults: Dictionary of default values to apply. + """ + + def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None: + """Recursively apply defaults to config_obj, using self as root.""" + for key, value in config_defaults.items(): + if not hasattr(config_obj, key): + continue + + current = getattr(config_obj, key) + if isinstance(value, dict) and is_dataclass(current): + apply_recursive(current, value) + else: + self._set_config_default(config_obj, key, value) + + apply_recursive(self, defaults) + def _post_init_kv_transfer_config(self) -> None: """Update KVTransferConfig based on top-level configs in VllmConfig. @@ -434,17 +581,47 @@ class VllmConfig: "precision for chunked prefill triton kernels." ) - # If the user does not explicitly set a compilation mode, then - # we use the default mode. The default mode depends on other - # settings (see the below code). 
+ if ( + self.optimization_level > OptimizationLevel.O0 + and self.model_config is not None + and self.model_config.enforce_eager + ): + logger.warning("Enforce eager set, overriding optimization level to -O0") + self.optimization_level = OptimizationLevel.O0 + + if self.compilation_config.backend == "eager" or ( + self.compilation_config.mode is not None + and self.compilation_config.mode != CompilationMode.VLLM_COMPILE + ): + logger.warning( + "Inductor compilation was disabled by user settings," + "Optimizations settings that are only active during" + "Inductor compilation will be ignored." + ) + + def has_blocked_weights(): + if self.quant_config is not None: + if hasattr(self.quant_config, "weight_block_size"): + return self.quant_config.weight_block_size is not None + elif hasattr(self.quant_config, "has_blocked_weights"): + return self.quant_config.has_blocked_weights() + return False + + # Enable quant_fp8 CUDA ops (TODO disable in follow up) + # On H100 the CUDA kernel is faster than + # native implementation + # https://github.com/vllm-project/vllm/issues/25094 + if has_blocked_weights(): + custom_ops = self.compilation_config.custom_ops + if "-quant_fp8" not in custom_ops: + custom_ops.append("+quant_fp8") + if self.compilation_config.mode is None: - if self.model_config is not None and not self.model_config.enforce_eager: + if self.optimization_level > OptimizationLevel.O0: self.compilation_config.mode = CompilationMode.VLLM_COMPILE else: self.compilation_config.mode = CompilationMode.NONE - # If user does not set custom ops via none or all set it here based on - # compilation mode and backend. 
if all(s not in self.compilation_config.custom_ops for s in ("all", "none")): if ( self.compilation_config.backend == "inductor" @@ -454,23 +631,33 @@ class VllmConfig: else: self.compilation_config.custom_ops.append("all") + default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level] + self._apply_optimization_level_defaults(default_config) + if ( + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and self.compilation_config.mode != CompilationMode.VLLM_COMPILE + ): + logger.info( + "Cudagraph mode %s is not compatible with compilation mode %s." + "Overriding to NONE.", + self.compilation_config.cudagraph_mode, + self.compilation_config.mode, + ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: self.compilation_config.pass_config.enable_sequence_parallelism = True + if self.compilation_config.pass_config.enable_sequence_parallelism: + if "-rms_norm" in self.compilation_config.custom_ops: + logger.warning( + "RMS norm force disabled, sequence parallelism might break" + ) + else: + self.compilation_config.custom_ops.append("+rms_norm") if current_platform.support_static_graph_mode(): - # if cudagraph_mode is not explicitly set by users, set default - # value - if self.compilation_config.cudagraph_mode is None: - if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: - # default to full and piecewise for most models - self.compilation_config.cudagraph_mode = ( - CUDAGraphMode.FULL_AND_PIECEWISE - ) - else: - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # if cudagraph_mode has full cudagraphs, we need to check support if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 696ff3a1f4024..e4c9a82d25223 100644 --- 
a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -77,6 +77,7 @@ from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field +from vllm.config.vllm import OptimizationLevel from vllm.logger import init_logger, suppress_logging from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -560,6 +561,7 @@ class EngineArgs: stream_interval: int = SchedulerConfig.stream_interval kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill + optimization_level: OptimizationLevel = VllmConfig.optimization_level kv_offloading_size: float | None = CacheConfig.kv_offloading_size kv_offloading_backend: KVOffloadingBackend | None = ( @@ -1114,6 +1116,10 @@ class EngineArgs: "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] ) + vllm_group.add_argument( + "--optimization-level", **vllm_kwargs["optimization_level"] + ) + # Other arguments parser.add_argument( "--disable-log-stats", @@ -1733,7 +1739,6 @@ class EngineArgs: compilation_config.max_cudagraph_capture_size = ( self.max_cudagraph_capture_size ) - config = VllmConfig( model_config=model_config, cache_config=cache_config, @@ -1750,6 +1755,7 @@ class EngineArgs: kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, additional_config=self.additional_config, + optimization_level=self.optimization_level, ) return config diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index 692e756d19634..b68157f02f6cc 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -247,16 +247,16 @@ class FlexibleArgumentParser(ArgumentParser): elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": # allow -O flag to be used without space, e.g. 
-O3 or -Odecode # -O.<...> handled later - # also handle -O= here - mode = arg[3:] if arg[2] == "=" else arg[2:] - processed_args.append(f"-O.mode={mode}") + # also handle -O= here + optimization_level = arg[3:] if arg[2] == "=" else arg[2:] + processed_args += ["--optimization-level", optimization_level] elif ( arg == "-O" and i + 1 < len(args) and args[i + 1] in {"0", "1", "2", "3"} ): - # Convert -O to -O.mode - processed_args.append("-O.mode") + # Convert -O to --optimization-level + processed_args.append("--optimization-level") else: processed_args.append(arg) @@ -294,10 +294,24 @@ class FlexibleArgumentParser(ArgumentParser): delete = set[int]() dict_args = defaultdict[str, dict[str, Any]](dict) duplicates = set[str]() + # Track regular arguments (non-dict args) for duplicate detection + regular_args_seen = set[str]() for i, processed_arg in enumerate(processed_args): if i in delete: # skip if value from previous arg continue + if processed_arg.startswith("--") and "." not in processed_arg: + if "=" in processed_arg: + arg_name = processed_arg.split("=", 1)[0] + else: + arg_name = processed_arg + + if arg_name in regular_args_seen: + duplicates.add(arg_name) + else: + regular_args_seen.add(arg_name) + continue + if processed_arg.startswith("-") and "." 
in processed_arg: if "=" in processed_arg: processed_arg, value_str = processed_arg.split("=", 1) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 4fd8eb50a4ea8..eb8e610ae4710 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -37,7 +37,7 @@ class CudaGraphManager: self.dp_size = vllm_config.parallel_config.data_parallel_size self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None - + self.cudagraph_mode: CUDAGraphMode if self.compilation_config.cudagraph_mode is None: self.cudagraph_mode = CUDAGraphMode.NONE else: From 51906c8c559f1d7c23efa667fcb3b7ed79f7fa25 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:09:24 +0800 Subject: [PATCH 005/770] [Docs] Improve `priority` parameter documentation (#29572) Signed-off-by: maang Signed-off-by: maang-h <55082429+maang-h@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/entrypoints/llm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1860f383d45fb..f6ee746789981 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -405,6 +405,9 @@ class LLM: lora_request: LoRA request to use for generation, if any. priority: The priority of the requests, if any. Only applicable when priority scheduling policy is enabled. + If provided, must be a list of integers matching the length + of `prompts`, where each priority value corresponds to the prompt + at the same index. 
Returns: A list of `RequestOutput` objects containing the From e6d4f3c254a215e75b4d76d531176e242fe62a1f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 27 Nov 2025 18:23:06 +0800 Subject: [PATCH 006/770] [Bugfix] Fix pre-commit (#29601) Signed-off-by: DarkLight1337 --- .../ec_connector/integration/test_epd_correctness.py | 5 ++--- vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py | 11 +++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/v1/ec_connector/integration/test_epd_correctness.py b/tests/v1/ec_connector/integration/test_epd_correctness.py index 69c4c58e349b9..616d34441ab8e 100644 --- a/tests/v1/ec_connector/integration/test_epd_correctness.py +++ b/tests/v1/ec_connector/integration/test_epd_correctness.py @@ -237,9 +237,8 @@ def main(): for i, prompt_data in enumerate(test_prompts): print( - f"\nRunning prompt {i + 1}/{len(test_prompts)}: { - prompt_data['description'] - }" + f"\nRunning prompt {i + 1}/{len(test_prompts)}: " + f"{prompt_data['description']}" ) output_str = run_chat_completion( diff --git a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py index a6f50d68cc684..dcdeedda60a77 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py @@ -35,13 +35,16 @@ class EagleCudaGraphManager: self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None + cudagraph_mode: CUDAGraphMode if self.compilation_config.cudagraph_mode is None: - self.cudagraph_mode = CUDAGraphMode.NONE + cudagraph_mode = CUDAGraphMode.NONE else: - self.cudagraph_mode = self.compilation_config.cudagraph_mode - if self.cudagraph_mode == CUDAGraphMode.FULL: + cudagraph_mode = self.compilation_config.cudagraph_mode + if cudagraph_mode == CUDAGraphMode.FULL: # NOTE(woosuk): For Eagle, we only use CUDA graphs for decode. 
- self.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + + self.cudagraph_mode = cudagraph_mode self.cudagraph_sizes = get_cudagraph_sizes( self.compilation_config.cudagraph_capture_sizes, From a5abd1d38439a026607d641c594ca98829ea5623 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 27 Nov 2025 19:33:19 +0800 Subject: [PATCH 007/770] [CI] Auto label CPU related issues (#29602) Signed-off-by: jiang1.li --- .github/workflows/issue_autolabel.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index 7d565ef9f2e45..a8251ceed07f4 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -105,6 +105,31 @@ jobs: } ], }, + cpu: { + // Keyword search - matches whole words only (with word boundaries) + keywords: [ + { + term: "[CPU]", + searchIn: "title" + }, + { + term: "x86", + searchIn: "title" + }, + { + term: "ARM", + searchIn: "title" + }, + { + term: "Apple Silicon", + searchIn: "title" + }, + { + term: "IBM Z", + searchIn: "title" + }, + ], + }, // Add more label configurations here as needed // example: { // keywords: [...], From cf348c8d27c34247f5976a86ebe6f4a3b4f9e888 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 27 Nov 2025 04:36:24 -0800 Subject: [PATCH 008/770] [Bugfix] Fix HunyuanVL XD-RoPE (#29593) Signed-off-by: Roger Wang Co-authored by: grider-transwithai --- vllm/model_executor/models/hunyuan_vision.py | 2 +- vllm/transformers_utils/processors/hunyuan_vl_image.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index e83addd0c092f..2950db571e6ee 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -847,7 +847,7 @@ class HunYuanVLForConditionalGeneration( .expand(-1, llm_grid_w + 
1) .reshape(-1) ) - h_index[pos : pos + token_num] = 0 + t_index[pos : pos + token_num] = image_index if xd_num == 4: llm_positions = torch.stack([p_index, w_index, h_index, t_index]) diff --git a/vllm/transformers_utils/processors/hunyuan_vl_image.py b/vllm/transformers_utils/processors/hunyuan_vl_image.py index 0a7e7865c783a..0b10ae249dbb6 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl_image.py +++ b/vllm/transformers_utils/processors/hunyuan_vl_image.py @@ -195,9 +195,9 @@ class HunYuanVLImageProcessor(BaseImageProcessor): processed_images = [] for image in images: if do_resize: - resized_width, resized_height = smart_resize( - width, - height, + resized_height, resized_width = smart_resize( + height=height, + width=width, factor=patch_size * merge_size, min_pixels=self.min_pixels, max_pixels=self.max_pixels, From 2f5f9acd551cfb737997a1f7f86982ec74aabf79 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 21:56:28 +0800 Subject: [PATCH 009/770] [LoRA] Continue optimizing MoE LoRA weight loading (#29322) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_checkpoints.py | 15 +- tests/lora/test_lora_huggingface.py | 8 +- vllm/lora/layers/base.py | 2 +- vllm/lora/layers/column_parallel_linear.py | 16 +- vllm/lora/layers/fused_moe.py | 202 ++++++++++--------- vllm/lora/layers/logits_processor.py | 2 +- vllm/lora/layers/replicated_linear.py | 2 +- vllm/lora/layers/row_parallel_linear.py | 4 +- vllm/lora/layers/vocal_parallel_embedding.py | 2 +- vllm/lora/lora_weights.py | 53 +++++ vllm/lora/models.py | 50 ++--- vllm/lora/utils.py | 17 +- vllm/lora/worker_manager.py | 10 +- vllm/model_executor/models/interfaces.py | 1 + vllm/model_executor/models/qwen3_vl_moe.py | 1 + 15 files changed, 228 insertions(+), 157 deletions(-) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 2219d470e91a1..b9b1bc59c6ed7 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -28,12 
+28,13 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) if lora_name == "baichuan7B": peft_helper = PEFTHelper.from_local_dir( baichuan_lora_files, max_position_embeddings=4096 @@ -103,13 +104,13 @@ def test_lora_weights_mapping(baichuan_lora_files): packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) - + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 7d20faef541aa..6a787471c74fd 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -26,13 +26,13 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping embedding_modules = LlamaForCausalLM.embedding_modules embed_padding_modules = 
LlamaForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in LLAMA_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) - + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and huggingface id. diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 3bfb88c007622..a4b8fb4d2aec5 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -60,7 +60,7 @@ class BaseLayerWithLoRA(nn.Module): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" raise NotImplementedError diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 3e21d426c304a..904025901fba7 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -153,7 +153,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is ColumnParallelLinear or ( type(source_layer) is MergedColumnParallelLinear @@ -272,7 +272,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return ( type(source_layer) is MergedColumnParallelLinear @@ -338,7 
+338,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1 @@ -396,7 +396,7 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3 @@ -434,7 +434,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -480,7 +480,7 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -516,7 +516,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -565,7 +565,7 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig 
| None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 1b925742c3002..3ad19370962ab 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -401,6 +401,61 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w13_lora_b_stacked[1][lora_id][experts_id] ) + def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1 or not self.fully_sharded: + return w13_lora_a + + # w13_lora_a shape (num_experts,rank,input_size) + current_lora_rank = w13_lora_a.shape[1] + assert current_lora_rank % self.tp_size == 0 + # Based on S-LoRA, we slice W13/W1/W3 A along the rank dim. + sliced_rank = current_lora_rank // self.tp_size + start_idx = self.tp_rank * sliced_rank + end_idx = (self.tp_rank + 1) * sliced_rank + return w13_lora_a[:, start_idx:end_idx, :] + + def _slice_w13_b(self, w13_lora_b: torch.Tensor): + if self.tp_size == 1: + return w13_lora_b + + # w13_lora_b shape (num_experts,output_size,rank) + shard_size = self.base_layer.intermediate_size_per_partition + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + + return w13_lora_b[:, start_idx:end_idx, :] + + def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1: + return w2_lora_a + # w2_lora_a shape (num_experts,rank,input_size) + shard_size = self.base_layer.intermediate_size_per_partition + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + + return w2_lora_a[:, :, start_idx:end_idx] + + def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1 or not 
self.fully_sharded: + return w2_lora_b + # Based on S-LoRA, we slice W2 B along the hidden_size dim. + # w2_lora_b shape (num_experts,output_size,rank) + current_lora_size = w2_lora_b.shape[1] + + sliced_size = current_lora_size // self.tp_size + start_idx = self.tp_rank * sliced_size + end_idx = (self.tp_rank + 1) * sliced_size + return w2_lora_b[:, start_idx:end_idx, :] + def reset_lora(self, index: int): """Resets the lora weights at index back to 0.""" for pos in range(self._w13_slices): @@ -411,6 +466,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w2_lora_b_stacked[0][index] = 0 self.adapter_enabled[index] = 0 + # + def set_lora( self, index: int, @@ -418,69 +475,55 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): lora_b: torch.Tensor | list[torch.Tensor], ): """Overwrites lora tensors at index.""" + # Make mypy happy assert isinstance(lora_a, list) assert isinstance(lora_b, list) + self.reset_lora(index) self.adapter_enabled[index] = 1 - for eid in range(len(lora_a) // 3): - w1_lora_a = lora_a[eid * 3] - w2_lora_a = lora_a[eid * 3 + 1] - w3_lora_a = lora_a[eid * 3 + 2] - w1_lora_b = lora_b[eid * 3] - w2_lora_b = lora_b[eid * 3 + 1] - w3_lora_b = lora_b[eid * 3 + 2] - # Handle the case of adding LoRA to only a subset of experts - if w1_lora_a is None or w2_lora_a is None or w3_lora_a is None: - continue + num_experts = self.w13_lora_a_stacked[0].shape[1] - if self.tp_size > 1: - shard_size = self.base_layer.intermediate_size_per_partition - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size + w1_lora_a, w2_lora_a, w3_lora_a = lora_a + w1_lora_b, w2_lora_b, w3_lora_b = lora_b + assert ( + num_experts + == w1_lora_a.shape[0] + == w2_lora_a.shape[0] + == w3_lora_a.shape[0] + ) - w1_lora_b = w1_lora_b[start_idx:end_idx, :] - w3_lora_b = w3_lora_b[start_idx:end_idx, :] - w2_lora_a = w2_lora_a[:, start_idx:end_idx] + slliced_w1_lora_a = self._slice_w13_a(w1_lora_a) + slliced_w1_lora_b = self._slice_w13_b(w1_lora_b) + 
slliced_w3_lora_a = self._slice_w13_a(w3_lora_a) + slliced_w3_lora_b = self._slice_w13_b(w3_lora_b) - if self.fully_sharded: - # Based on S-LoRA, we slice W1 and W3 A along the rank dim, - # and W2 B along the hidden_size dim. - w13_shard_size = self.w13_lora_a_stacked[0][index, eid].shape[0] - w13_start_idx = self.tp_rank * w13_shard_size - w13_end_idx = (self.tp_rank + 1) * w13_shard_size - w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] - w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :] + sliced_w2_lora_a = self._slice_w2_a(w2_lora_a) + sliced_w2_lora_b = self._slice_w2_b(w2_lora_b) - w2_shard_size = self.w2_lora_b_stacked[0][index, eid].shape[0] - w2_start_idx = self.tp_rank * w2_shard_size - w2_end_idx = (self.tp_rank + 1) * w2_shard_size - w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] - # w1 lora_a - self.w13_lora_a_stacked[0][ - index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] - ].copy_(w1_lora_a, non_blocking=True) - # w3 lora_a - self.w13_lora_a_stacked[1][ - index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1] - ].copy_(w3_lora_a, non_blocking=True) + self.w13_lora_a_stacked[0][ + index, :, : slliced_w1_lora_a.shape[1], : slliced_w1_lora_a.shape[2] + ].copy_(slliced_w1_lora_a, non_blocking=True) - # w1 lora_b - self.w13_lora_b_stacked[0][ - index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] - ].copy_(w1_lora_b, non_blocking=True) - # w3 lora_b - self.w13_lora_b_stacked[1][ - index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] - ].copy_(w3_lora_b, non_blocking=True) + self.w13_lora_a_stacked[1][ + index, :, : slliced_w3_lora_a.shape[1], : slliced_w3_lora_a.shape[2] + ].copy_(slliced_w3_lora_a, non_blocking=True) - self.w2_lora_a_stacked[0][ - index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] - ].copy_(w2_lora_a, non_blocking=True) + self.w13_lora_b_stacked[0][ + index, :, : slliced_w1_lora_b.shape[1], : slliced_w1_lora_b.shape[2] + ].copy_(slliced_w1_lora_b, non_blocking=True) - self.w2_lora_b_stacked[0][ - index, eid, : 
w2_lora_b.shape[0], : w2_lora_b.shape[1] - ].copy_(w2_lora_b, non_blocking=True) + self.w13_lora_b_stacked[1][ + index, :, : slliced_w3_lora_b.shape[1], : slliced_w3_lora_b.shape[2] + ].copy_(slliced_w3_lora_b, non_blocking=True) + + self.w2_lora_a_stacked[0][ + index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2] + ].copy_(sliced_w2_lora_a, non_blocking=True) + + self.w2_lora_b_stacked[0][ + index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2] + ].copy_(sliced_w2_lora_b, non_blocking=True) def forward(self, *args, **kwargs): return self.base_layer.forward(*args, **kwargs) @@ -506,12 +549,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - # return type(source_layer) is FusedMoE - return type(source_layer) is FusedMoE and len(packed_modules_list) == 2 + # source_layer is FusedMoE or SharedFusedMoE + return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 2 class FusedMoE3DWithLoRA(FusedMoEWithLoRA): @@ -555,6 +598,9 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + + assert isinstance(model_config, PretrainedConfig) + self._base_model = model_config.architectures[0] self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras @@ -565,20 +611,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): self._create_lora_a_weights(max_loras, lora_config) self._create_lora_b_weights(max_loras, lora_config) - def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1 or not self.fully_sharded: - return w13_lora_a - - # w13_lora_a shape (num_experts,rank,input_size) - current_lora_rank = w13_lora_a.shape[1] - assert current_lora_rank % self.tp_size 
== 0 - - sliced_rank = current_lora_rank // self.tp_size - start_idx = self.tp_rank * sliced_rank - end_idx = (self.tp_rank + 1) * sliced_rank - return w13_lora_a[:, start_idx:end_idx, :] - - def _slice_w13_b(self, w13_lora_b: torch.Tensor, is_interleave: bool = True): + def _slice_w13_b(self, w13_lora_b: torch.Tensor): if self.tp_size == 1: return w13_lora_b @@ -586,7 +619,8 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): shard_size = self.base_layer.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - if is_interleave: + # HACK: Currently, only GPT-OSS is in interleaved order + if self._base_model == "GptOssForCausalLM": # For models like GPT-OSS, the weights of w1 (gate_proj) and w3 (up_proj) # in the interleaved order, and corresponding LoRA need to be processed. w1_lora_b = w13_lora_b[:, ::2, :] @@ -606,28 +640,6 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): return torch.cat([sliced_w1_lora_b, sliced_w3_lora_b], dim=1) - def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1: - return w2_lora_a - # w2_lora_a shape (num_experts,rank,input_size) - shard_size = self.base_layer.intermediate_size_per_partition - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - - return w2_lora_a[:, :, start_idx:end_idx] - - def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1 or not self.fully_sharded: - return w2_lora_b - # Based on S-LoRA, we slice W2 B along the hidden_size dim. 
- # w2_lora_b shape (num_experts,output_size,rank) - current_lora_size = w2_lora_b.shape[1] - - sliced_size = current_lora_size // self.tp_size - start_idx = self.tp_rank * sliced_size - end_idx = (self.tp_rank + 1) * sliced_size - return w2_lora_b[:, start_idx:end_idx, :] - def set_lora( self, index: int, @@ -658,7 +670,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): w2_lora_b = w2_lora_b.permute(1, 0, 2) sliced_w13_lora_a = self._slice_w13_a(w13_lora_a) - sliced_w13_lora_b = self._slice_w13_b(w13_lora_b, is_interleave=True) + sliced_w13_lora_b = self._slice_w13_b(w13_lora_b) sliced_w2_lora_a = self._slice_w2_a(w2_lora_a) sliced_w2_lora_b = self._slice_w2_b(w2_lora_b) @@ -711,8 +723,8 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - - return type(source_layer) is FusedMoE and len(packed_modules_list) == 1 + # source_layer is FusedMoE or SharedFusedMoE + return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 1 diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index c01984db4e64c..01515f6136371 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -197,7 +197,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # Special handling for the LogitsProcessor. 
return False diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py index 243736c4ebc65..62bac546ccd1a 100644 --- a/vllm/lora/layers/replicated_linear.py +++ b/vllm/lora/layers/replicated_linear.py @@ -53,7 +53,7 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is ReplicatedLinear diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 95517b1aee263..958aa6af36746 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -87,7 +87,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is RowParallelLinear @@ -164,7 +164,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index c87ca9e24dece..4c1550d09e5e2 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -131,7 +131,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is VocabParallelEmbedding diff --git 
a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index f0d8e22194050..15c4a1be63eeb 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -152,6 +152,59 @@ class PackedLoRALayerWeights(LoRALayerWeights): ) return obj + @classmethod + def pack_moe( + cls, loras: GenericSequence[Optional["LoRALayerWeights"]], module_name: str + ) -> "PackedLoRALayerWeights": + """Pack a list of LoRAs into a single LoRA. + + If LoRA is None, it signifies that the submodule does not have a LoRA. + """ + + first_lora = next(lora for lora in loras if lora is not None) + assert first_lora is not None + rank = first_lora.rank + lora_alpha = first_lora.lora_alpha + assert len(loras) % 3 == 0 + w1_lora_a_lst = [] + w2_lora_a_lst = [] + w3_lora_a_lst = [] + w1_lora_b_lst = [] + w2_lora_b_lst = [] + w3_lora_b_lst = [] + # TODO: Consider the case where some experts don't have LoRA added. + for eid in range(len(loras) // 3): + w1_lora = loras[eid * 3] + w2_lora = loras[eid * 3 + 1] + w3_lora = loras[eid * 3 + 2] + assert w1_lora is not None + assert w2_lora is not None + assert w3_lora is not None + + w1_lora_a_lst.append(w1_lora.lora_a) + w2_lora_a_lst.append(w2_lora.lora_a) + w3_lora_a_lst.append(w3_lora.lora_a) + + w1_lora_b_lst.append(w1_lora.lora_b) + w2_lora_b_lst.append(w2_lora.lora_b) + w3_lora_b_lst.append(w3_lora.lora_b) + + w1_lora_a = torch.stack(w1_lora_a_lst, dim=0) # (num_experts,rank,input_size) + w2_lora_a = torch.stack(w2_lora_a_lst, dim=0) + w3_lora_a = torch.stack(w3_lora_a_lst, dim=0) + w1_lora_b = torch.stack(w1_lora_b_lst, dim=0) # (num_experts,output_size,rank) + w2_lora_b = torch.stack(w2_lora_b_lst, dim=0) + w3_lora_b = torch.stack(w3_lora_b_lst, dim=0) + + obj = cls( + module_name, + rank, + [lora_alpha, lora_alpha, lora_alpha], + [w1_lora_a, w2_lora_a, w3_lora_a], + [w1_lora_b, w2_lora_b, w3_lora_b], + ) + return obj + def optimize(self) -> "PackedLoRALayerWeights": """Optimize the LoRA by merging the scaling into lora_b.""" for i in 
range(len(self.lora_b)): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 636f062feb7b0..4caaf0e117cc4 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,7 @@ from torch import nn from vllm.config.lora import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA, FusedMoEWithLoRA, LoRAMapping +from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper @@ -151,16 +151,13 @@ class LoRAModel: if pin_memory: loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() - for lora in loras.values(): - lora.optimize() - return cls(lora_model_id, peft_helper.r, loras) @classmethod def from_local_checkpoint( cls, lora_dir: str, - expected_lora_modules: list[str], + expected_lora_modules: set[str], peft_helper: PEFTHelper, *, lora_model_id: int | None = None, @@ -190,10 +187,7 @@ class LoRAModel: lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - # new_embeddings_tensor_path = os.path.join( - # lora_dir, "new_embeddings.safetensors" - # ) - # new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} unexpected_modules: list[list[str] | str] = [] @@ -201,18 +195,19 @@ class LoRAModel: for lora_module in modules.keys(): # noqa if is_base_embeddding_weights(lora_module): continue - module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) - # Handle FSDP file format where experts.base_layer is the + # Handle PEFT file format where experts.base_layer is the # gate_up_proj and experts is the down_proj if "base_layer" in lora_module: continue + module_name, _ = 
parse_fine_tuned_lora_name(lora_module, weights_mapper) # Case for expert lora weights if ".experts" in module_name: - if not any( - module_name.endswith(ele) for ele in expected_lora_modules - ): + expert_idx = module_name.find(".experts") + expert_suffix = module_name[expert_idx + 1 :] + if expert_suffix not in expected_lora_modules: unexpected_modules.append(module_name) - elif module_name.split(".")[-1] not in expected_lora_modules: + + elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules: unexpected_modules.append(module_name) if unexpected_modules: @@ -358,9 +353,7 @@ class LoRAModelManager: self.modules: dict[str, BaseLayerWithLoRA] = {} # Dict instead of a set for compatibility with LRUCache. self._last_mapping: LoRAMapping | None = None - self._is_3d_moe_model = is_moe_model(self.model) and hasattr( - self.model, "is_3d_moe_weight" - ) + self._is_3d_moe_model = is_moe_model(self.model) and self.model.is_3d_moe_weight self._create_lora_modules() self.model.lora_manager = self @@ -411,7 +404,7 @@ class LoRAModelManager: continue # Note (gnovack) - If MOE lora weights are not split into # num_experts chunks, we split them here - if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor( + if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor( module_lora.lora_a ): # Handle PEFT file format where experts.base_layer is the @@ -679,7 +672,10 @@ class LoRAModelManager: "cpu", ) subloras.append(lora) - lora = PackedLoRALayerWeights.pack(subloras) + if module.__class__.__name__ == "FusedMoEWithLoRA": + lora = PackedLoRALayerWeights.pack_moe(subloras, module_name) + else: + lora = PackedLoRALayerWeights.pack(subloras) model.loras[module_name] = lora return model @@ -739,13 +735,21 @@ class LoRAModelManager: replaced_module_name = module_name.replace("model.", "") if lora_model.check_lora_name(module_name): module_name = replaced_module_name - lora_model.loras[module_name] = PackedLoRALayerWeights.pack( - replacement_loras - ) + if 
module_name.endswith(".experts"): + lora_model.loras[module_name] = PackedLoRALayerWeights.pack_moe( + replacement_loras, module_name + ) + else: + lora_model.loras[module_name] = PackedLoRALayerWeights.pack( + replacement_loras + ) # Remove the modules that have been replaced. for module in replaced_module: lora_model.loras.pop(module, None) + for lora in lora_model.loras.values(): + lora.optimize() + def _get_lora_layer_weights( self, lora_model: LoRAModel, module_name: str ) -> LoRALayerWeights | None: diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 12524994d4968..47484b2b984df 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -170,16 +170,15 @@ def parse_fine_tuned_lora_name( def is_base_embeddding_weights(name: str) -> bool: # hardcoded subfixes for input & output embedding weights - input_embedding_subfix = ".embed_tokens.base_layer.weight" - output_embedding_subfix = ".lm_head.base_layer.weight" - - return name.endswith(input_embedding_subfix) or name.endswith( - output_embedding_subfix + embedding_suffixes = ( + ".embed_tokens.base_layer.weight", + ".lm_head.base_layer.weight", ) + return name.endswith(embedding_suffixes) def is_regex_target_modules( - load_modules: str | list[str], expected_lora_modules: list[str] + load_modules: str | list[str], expected_lora_modules: set[str] ) -> bool: """ PEFT supports passing `target_modules` in the form of regular expressions, @@ -195,8 +194,8 @@ def is_regex_target_modules( except re.error: return False - def is_subset(sub_list, full_list): - return set(sub_list).issubset(set(full_list)) + def is_subset(sub_list, full_set): + return set(sub_list).issubset(full_set) # Similar to PEFT's processing logic, regex-related operations are only # executed when the load_modules is a `str`. @@ -290,7 +289,7 @@ def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]: # the expert indices are expanded based on the configured number # of routed experts. 
packed_modules_mapping = get_packed_modules_mapping(model) - if not hasattr(model, "is_3d_moe_weight"): + if not model.is_3d_moe_weight: # 3D MoE LoRA does not need `packed_modules_mapping` packed_modules_mapping["experts"] = [ weight_name.rstrip(".") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4cc201a6414f1..d9a03f0500497 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -88,15 +88,15 @@ class WorkerLoRAManager: try: supported_lora_modules = self._adapter_manager.supported_lora_modules packed_modules_mapping = self._adapter_manager.packed_modules_mapping - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) if module == "experts": - expected_lora_modules.append(module) - expected_lora_modules = list(set(expected_lora_modules)) + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) lora_path = get_adapter_absolute_path(lora_request.lora_path) peft_helper = PEFTHelper.from_local_dir( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6f6ce32538b71..cee0b79e5e5ac 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -336,6 +336,7 @@ class SupportsLoRA(Protocol): There is no need to redefine this flag if this class is in the MRO of your model class. """ + is_3d_moe_weight: ClassVar[bool] = False # The `embedding_module` and `embedding_padding_modules` # are empty by default. 
embedding_modules: ClassVar[dict[str, str]] = {} diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index e2c129120b1a5..a054bd5b3831e 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -401,6 +401,7 @@ class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts): class Qwen3VLMoeForConditionalGeneration( Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts ): + is_3d_moe_weight: bool = True packed_modules_mapping = { "qkv_proj": [ "q_proj", From 882851dc817061de52c949ac27b11442e5529caa Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 27 Nov 2025 22:51:26 +0800 Subject: [PATCH 010/770] [CI/Build][Bugfix] Fix auto label issues for CPU (#29610) Signed-off-by: jiang1.li --- .github/workflows/issue_autolabel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index a8251ceed07f4..629966b959330 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -109,7 +109,7 @@ jobs: // Keyword search - matches whole words only (with word boundaries) keywords: [ { - term: "[CPU]", + term: "CPU Backend", searchIn: "title" }, { From bab438ff3e7bd93f861e66a60c6cbefe42af0d1a Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Thu, 27 Nov 2025 09:01:37 -0600 Subject: [PATCH 011/770] [CI/Build] Skip ray tests on ROCm (#29556) Signed-off-by: Ryan Rock --- tests/v1/distributed/test_async_llm_dp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/v1/distributed/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py index 60f9017184ea0..3b5f2e5e8d72f 100644 --- a/tests/v1/distributed/test_async_llm_dp.py +++ b/tests/v1/distributed/test_async_llm_dp.py @@ -12,6 +12,7 @@ from vllm import SamplingParams from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import PromptType +from 
vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import DPAsyncMPClient @@ -84,6 +85,10 @@ async def test_load( if async_scheduling and data_parallel_backend == "ray": # TODO(NickLucche) Re-enable when async scheduling is supported pytest.skip("Async scheduling is not supported with ray") + elif data_parallel_backend == "ray" and current_platform.is_rocm(): + pytest.skip( + "Ray as the distributed executor backend is not supported with ROCm." + ) stats_loggers = {} @dataclass From 66d3d5422c9b90f1ee9593e1793e86f14e4eb3f4 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:15:50 +0100 Subject: [PATCH 012/770] [Doc]: fixing typos in diverse files (#29492) Signed-off-by: Didier Durand --- vllm/benchmarks/serve.py | 4 ++-- vllm/config/parallel.py | 4 ++-- vllm/lora/punica_wrapper/punica_base.py | 2 +- vllm/model_executor/models/adapters.py | 4 ++-- vllm/v1/sample/tpu/sampler.py | 2 +- vllm/v1/worker/dp_utils.py | 6 ++++-- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index dddb050ec180e..519303c0bfa0a 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " "for headers to be passed with each request. These headers override " "per backend constants and values set via environment variable, and " - "will be overriden by other arguments (such as request ids).", + "will be overridden by other arguments (such as request ids).", ) parser.add_argument( "--max-concurrency", @@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--percentile-metrics", type=str, default=None, - help="Comma-separated list of selected metrics to report percentils. 
" + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'If not specified, defaults to "ttft,tpot,itl" for generative models ' diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7ba1da5db3849..4a8c8bc17cfc3 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -238,9 +238,9 @@ class ParallelConfig: cp_kv_cache_interleave_size: int = 1 """Interleave size of kv_cache storage while using DCP or PCP. For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`, - and `total_cp_world_size = pcp_world_size * dcp_world_szie`. + and `total_cp_world_size = pcp_world_size * dcp_world_size`. store interleave_size tokens on total_cp_rank i, - then store next interleave_size tokens on taotal_cp_rank i+1. + then store next interleave_size tokens on total_cp_rank i+1. Interleave_size=1: token-level alignment, where token `i` is stored on total_cp_rank `i % total_cp_world_size`. Interleave_size=block_size: block-level alignment, where tokens are diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index ce38751e4b6a7..47c42b095534a 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC): vocab_size: int, ): # NOTE We have remove lora extra vocab support for now. So we set - # extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed. + # extra_vocab_size always to 0, and extra_vocab_size will be removed. 
extra_vocab_size = 0 ( diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index a9cc49451a1d3..5aba46f8614be 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -428,7 +428,7 @@ def load_weights_using_from_2_way_softmax( ) if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not - # have this attribute, we fallback to get_input_embeddings(), which is used by + # have this attribute, we fall back to get_input_embeddings(), which is used by # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens @@ -486,7 +486,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te ) if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not - # have this attribute, we fallback to get_input_embeddings(), which is used by + # have this attribute, we fall back to get_input_embeddings(), which is used by # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 8f0463c76ce15..6d992bb37a59d 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -181,7 +181,7 @@ def apply_top_k_top_p( after thresholding the logit using this cut-off, the remaining elements shall constitute the top-p set. - Note: in the case of tie (i.e. multipple cut-off elements present in the + Note: in the case of tie (i.e. multiple cut-off elements present in the logit), all tie elements are included in the top-p set. In other words, this function does not break ties. 
Instead, these tie tokens have equal chance of being chosen during final sampling, so we can consider the tie diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 064f2f0360cbf..c1509de821b05 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig): device = get_dp_group().device group = get_dp_group().device_group - # Transfering this tensor from GPU to CPU will introduce a GPU sync + # Transferring this tensor from GPU to CPU will introduce a GPU sync # point that could adversely affect performance of vllm with asynch # scheduling. This environment variable exists to quickly disable # this optimization if we run into this case. if parallel_config.disable_nccl_for_dp_synchronization: - logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.") + logger.info_once( + "Using CPU all reduce to synchronize DP padding between ranks." + ) device = "cpu" group = get_dp_group().cpu_group return device, group From cd007a53b4a2d7a83e35de559dc87da09302e956 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Thu, 27 Nov 2025 16:32:38 +0100 Subject: [PATCH 013/770] [bugfix] avoid NIXL_ERR_REMOTE_DISCONNECT in nixl_connector when Prefill dies (#28120) Signed-off-by: Mathis Felardos --- .../kv_connector/v1/nixl_connector.py | 62 ++++++++++++------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ff51840b84b14..d5edf84e8e7f1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1832,35 +1832,55 @@ class NixlConnectorWorker: done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): in_progress = False - for handle, _xfer_stime in handles: - xfer_state = 
self.nixl_wrapper.check_xfer_state(handle) - if xfer_state == "DONE": - # Get telemetry from NIXL - res = self.nixl_wrapper.get_xfer_telemetry(handle) - self.xfer_stats.record_transfer(res) - self.nixl_wrapper.release_xfer_handle(handle) - elif xfer_state == "PROC": - in_progress = True - continue - else: - # transfer failed - mark blocks as invalid - logger.error( - "NIXL transfer failed for request %s with state %s. " + for handle, xfer_start_time in handles: + try: + xfer_state = self.nixl_wrapper.check_xfer_state(handle) + if xfer_state == "DONE": + # Get telemetry from NIXL + res = self.nixl_wrapper.get_xfer_telemetry(handle) + self.xfer_stats.record_transfer(res) + self.nixl_wrapper.release_xfer_handle(handle) + elif xfer_state == "PROC": + in_progress = True + continue + else: + logger.error( + "NIXL transfer failed for request %s with state " + "%s. Marking blocks as invalid.", + req_id, + xfer_state, + ) + self._handle_failed_transfer(req_id, handle) + in_progress = False + except Exception: + logger.exception( + "NIXL transfer exception for request %s. " "Marking blocks as invalid.", req_id, - xfer_state, ) - # mark all (logical)blocks for this request as invalid - if meta := self._recving_metadata.pop(req_id, None): - self._invalid_block_ids.update(meta.local_block_ids) - self._recving_metadata.pop(req_id, None) - self.nixl_wrapper.release_xfer_handle(handle) - self.xfer_stats.record_failed_transfer() + self._handle_failed_transfer(req_id, handle) + in_progress = False + if not in_progress: done_req_ids.add(req_id) del transfers[req_id] return done_req_ids + def _handle_failed_transfer(self, req_id: str, handle: int): + """ + Handle a failed transfer by marking all (logical) blocks as invalid and + recording the failure. + + Args: + req_id: The request ID. + handle: The transfer handle. 
+ """ + if meta := self._recving_metadata.pop(req_id, None): + self._invalid_block_ids.update(meta.local_block_ids) + self._recving_metadata.pop(req_id, None) + self.nixl_wrapper.release_xfer_handle(handle) + self.xfer_stats.record_failed_transfer() + def start_load_kv(self, metadata: NixlConnectorMetadata): """ Start loading by triggering non-blocking nixl_xfer. From fc1d8be3dc97e33ade7fb578451006bb044a5e60 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 27 Nov 2025 11:19:09 -0500 Subject: [PATCH 014/770] [Attention] Update attention imports (#29540) Signed-off-by: Matthew Bonanni --- .../test_rocm_attention_backends_selection.py | 9 +++------ .../kv_connector/unit/test_backwards_compatibility.py | 6 +++--- vllm/attention/backends/abstract.py | 11 ++++------- vllm/attention/layers/chunked_local_attention.py | 3 +-- vllm/config/model.py | 3 +-- vllm/config/multimodal.py | 11 ++--------- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 4 ++-- .../kv_connector/v1/decode_bench_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/lmcache_connector.py | 4 ++-- .../v1/lmcache_integration/vllm_v1_adapter.py | 4 ++-- .../kv_connector/v1/lmcache_mp_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/multi_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/nixl_connector.py | 5 ++--- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 4 ++-- .../kv_connector/v1/shared_storage_connector.py | 4 ++-- vllm/forward_context.py | 8 +++----- vllm/model_executor/layers/attention_layer_base.py | 7 ++----- vllm/model_executor/layers/mamba/abstract.py | 7 ++----- .../compressed_tensors/compressed_tensors.py | 3 +-- vllm/model_executor/layers/quantization/fp8.py | 4 +--- vllm/model_executor/layers/quantization/modelopt.py | 3 +-- vllm/model_executor/layers/quantization/mxfp4.py | 3 +-- vllm/model_executor/layers/quantization/petit.py | 3 +-- vllm/model_executor/layers/quantization/ptpc_fp8.py | 3 +-- .../model_executor/layers/quantization/quark/quark.py | 3 +-- 
vllm/platforms/cpu.py | 5 +---- vllm/platforms/cuda.py | 10 ++-------- vllm/platforms/interface.py | 5 +---- vllm/platforms/rocm.py | 6 +----- vllm/platforms/tpu.py | 5 +---- vllm/platforms/xpu.py | 7 +------ vllm/v1/attention/backends/cpu_attn.py | 2 -- vllm/v1/attention/backends/flash_attn.py | 2 -- vllm/v1/attention/backends/flex_attention.py | 2 -- vllm/v1/attention/backends/utils.py | 7 +++++-- vllm/v1/kv_offload/spec.py | 4 ++-- vllm/v1/spec_decode/eagle.py | 3 +-- vllm/v1/worker/utils.py | 7 ++----- 38 files changed, 63 insertions(+), 126 deletions(-) diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py index 80158d4b7278c..77790be6f892b 100644 --- a/tests/v1/attention/test_rocm_attention_backends_selection.py +++ b/tests/v1/attention/test_rocm_attention_backends_selection.py @@ -139,14 +139,13 @@ def test_standard_attention_backend_selection( import importlib import vllm.envs as envs - from vllm.attention.backends.registry import _Backend importlib.reload(envs) # Convert string backend to enum if provided backend_enum = None if selected_backend: - backend_enum = getattr(_Backend, selected_backend) + backend_enum = getattr(AttentionBackendEnum, selected_backend) # Get the backend class path from vllm.platforms.rocm import RocmPlatform @@ -253,7 +252,6 @@ def test_mla_backend_selection( import importlib import vllm.envs as envs - from vllm.attention.backends.registry import _Backend importlib.reload(envs) @@ -269,7 +267,7 @@ def test_mla_backend_selection( # Convert string backend to enum if provided backend_enum = None if selected_backend: - backend_enum = getattr(_Backend, selected_backend) + backend_enum = getattr(AttentionBackendEnum, selected_backend) from vllm.platforms.rocm import RocmPlatform @@ -301,7 +299,6 @@ def test_mla_backend_selection( def test_aiter_fa_requires_gfx9(mock_vllm_config): """Test that ROCM_AITER_FA requires gfx9 architecture.""" - from 
vllm.attention.backends.registry import _Backend from vllm.platforms.rocm import RocmPlatform # Mock on_gfx9 to return False @@ -313,7 +310,7 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config): ), ): RocmPlatform.get_attn_backend_cls( - selected_backend=_Backend.ROCM_AITER_FA, + selected_backend=AttentionBackendEnum.ROCM_AITER_FA, head_size=128, dtype=torch.float16, kv_cache_dtype="auto", diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py index f51001a6ec12a..7cd23805c599d 100644 --- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py +++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py @@ -14,6 +14,7 @@ from unittest.mock import patch import pytest +from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, @@ -24,7 +25,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from .utils import create_scheduler, create_vllm_config if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -68,7 +68,7 @@ class OldStyleTestConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: pass @@ -119,7 +119,7 @@ class NewStyleTestConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: pass diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index a321167b8090f..c290670eeacb0 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -6,11 +6,10 @@ from typing import 
TYPE_CHECKING, ClassVar, Generic, Protocol, TypeVar, get_args import torch -from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey - if TYPE_CHECKING: from vllm.config.cache import CacheDType + from vllm.model_executor.layers.linear import ColumnParallelLinear + from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey from vllm.platforms.interface import DeviceCapability from vllm.v1.attention.backends.utils import KVCacheLayoutType @@ -178,8 +177,6 @@ class AttentionBackend(ABC): By default, only supports decoder attention. Backends should override this to support other attention types. """ - from vllm.attention.backends.abstract import AttentionType - return attn_type == AttentionType.DECODER @classmethod @@ -360,7 +357,7 @@ class AttentionImpl(ABC, Generic[T]): ) -> torch.Tensor: raise NotImplementedError - def fused_output_quant_supported(self, quant_key: QuantKey): + def fused_output_quant_supported(self, quant_key: "QuantKey"): """ Does this attention implementation support fused output quantization. 
This is used by the AttnFusionPass to only fuse output quantization @@ -412,7 +409,7 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]): qk_rope_head_dim: int, qk_head_dim: int, v_head_dim: int, - kv_b_proj: ColumnParallelLinear, + kv_b_proj: "ColumnParallelLinear", indexer: object | None = None, ) -> None: raise NotImplementedError diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py index 48fcc6fa736bb..0ced0028ded9e 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/attention/layers/chunked_local_attention.py @@ -5,6 +5,7 @@ import functools import torch from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig @@ -22,8 +23,6 @@ from vllm.v1.kv_cache_interface import ( KVCacheSpec, ) -from ..layer import Attention - @functools.lru_cache def create_chunked_local_attention_backend( diff --git a/vllm/config/model.py b/vllm/config/model.py index 84311596b660c..5dabd636c18c6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -14,6 +14,7 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType @@ -53,7 +54,6 @@ if TYPE_CHECKING: import vllm.model_executor.layers.quantization as me_quant import vllm.model_executor.models as me_models - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.load import LoadConfig from vllm.config.parallel import ParallelConfig from vllm.model_executor.layers.quantization import 
QuantizationMethods @@ -61,7 +61,6 @@ if TYPE_CHECKING: else: PretrainedConfig = Any - AttentionBackendEnum = Any me_quant = LazyLoader( "model_executor", globals(), "vllm.model_executor.layers.quantization" ) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 590bc4dcd0760..8a2936de96d6f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -2,19 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Literal, TypeAlias +from typing import Any, Literal, TypeAlias from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.utils import config from vllm.utils.hashing import safe_hash -if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum -else: - AttentionBackendEnum = Any - @dataclass class BaseDummyOptions: @@ -170,9 +166,6 @@ class MultiModalConfig: def _validate_mm_encoder_attn_backend( cls, value: str | AttentionBackendEnum | None ) -> AttentionBackendEnum | None: - # We need to import the real type here (deferred to avoid circular import). 
- from vllm.attention.backends.registry import AttentionBackendEnum - if isinstance(value, str) and value.upper() == "XFORMERS": raise ValueError( "Attention backend 'XFORMERS' has been removed (See PR #29262 for " diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 74f09278b7bb1..cac45425bb7aa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -42,12 +42,12 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional import torch +from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -239,7 +239,7 @@ class KVConnectorBase_V1(ABC): return def register_cross_layers_kv_cache( - self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"] + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] ): """ Initialize with a single KV cache tensor used by all layers. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py index 9cd7d93c92fa3..e9b2bd392b0ef 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py @@ -36,6 +36,7 @@ from typing import TYPE_CHECKING, Any, Optional import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -45,7 +46,6 @@ from vllm.logger import init_logger from vllm.utils.math_utils import cdiv if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -117,7 +117,7 @@ class DecodeBenchConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: # This connector doesn't save KV cache (benchmarking only) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 0c24a53fb754b..30da424ddcca0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -7,6 +7,7 @@ from lmcache.integration.vllm.vllm_v1_adapter import ( LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, ) +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -17,7 +18,6 @@ from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import 
AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig @@ -91,7 +91,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 94572b02fa872..15ac5b049fce9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -29,6 +29,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer from lmcache.v1.plugin.plugin_launcher import PluginLauncher +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -50,7 +51,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.version import __version__ as VLLM_VERSION if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -915,7 +915,7 @@ class LMCacheConnectorV1Impl: self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: """Start saving the a layer of KV cache from vLLM's paged buffer diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index d1d3e475cc889..a4bddf5e03166 
100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -10,6 +10,7 @@ import zmq from lmcache.integration.vllm.utils import mla_enabled from lmcache.utils import init_logger as lmcache_init_logger +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -26,7 +27,6 @@ from vllm.v1.outputs import KVConnectorOutput from vllm.v1.utils import ConstantList if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -490,7 +490,7 @@ class LMCacheMPConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c9d08e9b78ed0..f47e8ca7e6c50 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType @@ -27,7 +28,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_events import KVCacheEvent from vllm.forward_context import ForwardContext from 
vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -216,7 +216,7 @@ class MultiConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: for c in self._connectors: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index d5edf84e8e7f1..24c8d32dafedc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -20,7 +20,7 @@ import torch import zmq from vllm import envs -from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig @@ -51,7 +51,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request @@ -308,7 +307,7 @@ class NixlConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: """NixlConnector does not save explicitly.""" diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index a124a0d519db8..8f3a62d7bcdb0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional import regex as 
re import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -22,7 +23,6 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig @@ -243,7 +243,7 @@ class P2pNcclConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 4611b4d1ff7b8..ed641cfc43ddd 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional import safetensors import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -19,7 +20,6 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig @@ -211,7 +211,7 @@ class SharedStorageConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - 
attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 635419bc7cad4..173d366267e87 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -5,19 +5,17 @@ import time from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, NamedTuple +from typing import Any, NamedTuple import torch import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ubatch_utils import UBatchSlices -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata - logger = init_logger(__name__) track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 @@ -195,7 +193,7 @@ class ForwardContext: for each microbatch. 
Set dynamically for each forward pass """ - attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]] + attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]] # TODO: remove after making all virtual_engines share the same kv cache virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass diff --git a/vllm/model_executor/layers/attention_layer_base.py b/vllm/model_executor/layers/attention_layer_base.py index ffbef470b1868..a60cf787135c0 100644 --- a/vllm/model_executor/layers/attention_layer_base.py +++ b/vllm/model_executor/layers/attention_layer_base.py @@ -3,14 +3,11 @@ """Base class for attention-like layers.""" from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.v1.kv_cache_interface import KVCacheSpec -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class AttentionLayerBase(ABC): """ @@ -22,7 +19,7 @@ class AttentionLayerBase(ABC): """ @abstractmethod - def get_attn_backend(self) -> type["AttentionBackend"]: + def get_attn_backend(self) -> type[AttentionBackend]: """Get the attention backend class for this layer.""" pass diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index aa919d6fdc35c..74f4383e9c238 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -2,18 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable -from typing import TYPE_CHECKING import torch +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.selector import get_mamba_attn_backend from vllm.config import VllmConfig from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from 
vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class MambaBase(AttentionLayerBase): """ @@ -66,6 +63,6 @@ class MambaBase(AttentionLayerBase): ), ) - def get_attn_backend(self) -> type["AttentionBackend"]: + def get_attn_backend(self) -> type[AttentionBackend]: """Get the attention backend class for this Mamba layer.""" return get_mamba_attn_backend(self.mamba_type) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 7f61746a4e45c..f9d8f5883680b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -18,6 +18,7 @@ from compressed_tensors.quantization import ( from compressed_tensors.transform import TransformConfig import vllm.envs as envs +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( @@ -131,8 +132,6 @@ class CompressedTensorsConfig(QuantizationConfig): layer: torch.nn.Module, prefix: str, ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if isinstance(layer, LinearBase): # collect schemes quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e033032903e87..7dfc8a9c36c3e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -14,6 +14,7 @@ import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops +from vllm.attention.layer import 
Attention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.batch_invariant import ( @@ -277,7 +278,6 @@ class Fp8Config(QuantizationConfig): def get_xpu_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention from vllm.model_executor.layers.quantization.ipex_quant import ( XPUFp8LinearMethod, XPUFp8MoEMethod, @@ -307,8 +307,6 @@ class Fp8Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if current_platform.is_xpu(): return self.get_xpu_quant_method(layer, prefix) if isinstance(layer, LinearBase): diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 2cf7089e0ff90..80f8e3a03e7cf 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -12,6 +12,7 @@ from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -149,8 +150,6 @@ class ModelOptQuantConfigBase(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - # handle kv-cache first so we can focus only on weight quantization thereafter if isinstance(layer, Attention): return self.KVCacheMethodCls(self) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 
d975131f7cff7..bc241ac692e23 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -8,6 +8,7 @@ import torch from torch.nn.parameter import Parameter from vllm import envs +from vllm.attention.layer import Attention from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( @@ -184,8 +185,6 @@ class Mxfp4Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if isinstance(layer, LinearBase): if self.ignored_layers and is_layer_skipped( prefix=prefix, diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py index 402cebc38c215..5ccc73166361a 100644 --- a/vllm/model_executor/layers/quantization/petit.py +++ b/vllm/model_executor/layers/quantization/petit.py @@ -8,6 +8,7 @@ import regex as re import torch from torch.nn.parameter import Parameter +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( LinearBase, @@ -159,8 +160,6 @@ class PetitNvFp4Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - exclude = self.require_exclude_modules() if isinstance(layer, LinearBase): diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 26ba8e5b16bc0..ed8a2c7fa0841 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -7,6 +7,7 @@ import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops +from vllm.attention.layer import Attention from vllm.logger import 
init_logger from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods @@ -65,8 +66,6 @@ class PTPCFp8Config(Fp8Config): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if isinstance(layer, LinearBase): if is_layer_skipped(prefix, self.ignored_layers): return UnquantizedLinearMethod() diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index f59e5e2a0af7a..3640e5c452786 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional, cast import torch +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( @@ -102,8 +103,6 @@ class QuarkConfig(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - # Check if the layer is skipped for quantization. 
exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index ed655912d3964..5f9561366e0d5 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -14,6 +14,7 @@ import regex as re import torch from vllm import envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from .interface import CpuArchEnum, Platform, PlatformEnum @@ -21,10 +22,8 @@ from .interface import CpuArchEnum, Platform, PlatformEnum logger = init_logger(__name__) if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig else: - AttentionBackendEnum = None VllmConfig = None @@ -135,8 +134,6 @@ class CpuPlatform(Platform): use_sparse: bool, attn_type: str | None = None, ) -> str: - from vllm.attention.backends.registry import AttentionBackendEnum - if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN: logger.info("Cannot use %s backend on CPU.", selected_backend) if use_mla: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index e8e14387bb7f6..d5c3a177d9c2b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -15,6 +15,8 @@ from typing_extensions import ParamSpec # import custom ops, trigger op registration import vllm._C # noqa import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.import_utils import import_pynvml from vllm.utils.torch_utils import cuda_device_count_stateless @@ -22,11 +24,9 @@ from vllm.utils.torch_utils import cuda_device_count_stateless from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import CacheDType else: - 
AttentionBackendEnum = None VllmConfig = None CacheDType = None @@ -48,8 +48,6 @@ def _get_backend_priorities( device_capability: DeviceCapability, ) -> list[AttentionBackendEnum]: """Get backend priorities with lazy import to avoid circular dependency.""" - from vllm.attention.backends.registry import AttentionBackendEnum - if use_mla: if device_capability.major == 10: return [ @@ -265,8 +263,6 @@ class CudaPlatformBase(Platform): def get_vit_attn_backend( cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": - from vllm.attention.backends.registry import AttentionBackendEnum - # Try FlashAttention first try: backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() @@ -335,8 +331,6 @@ class CudaPlatformBase(Platform): use_sparse: bool, attn_type: str | None = None, ) -> str: - from vllm.attention.backends.abstract import AttentionType - if attn_type is None: attn_type = AttentionType.DECODER diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 1e6b53021f888..27c6fac09f498 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -12,12 +12,12 @@ from typing import TYPE_CHECKING, Any, NamedTuple import numpy as np import torch +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger if TYPE_CHECKING: from torch.distributed import PrefixStore, ProcessGroup - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.inputs import ProcessorInputs, PromptType @@ -226,9 +226,6 @@ class Platform: def get_vit_attn_backend( cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": - # Import AttentionBackendEnum here to avoid circular import. 
- from vllm.attention.backends.registry import AttentionBackendEnum - return AttentionBackendEnum.TORCH_SDPA @classmethod diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 0483f6c06ada8..ccf3446a3a6e5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -8,16 +8,14 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.torch_utils import cuda_device_count_stateless from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig -else: - AttentionBackendEnum = None logger = init_logger(__name__) @@ -196,7 +194,6 @@ class RocmPlatform(Platform): from importlib.util import find_spec from vllm._aiter_ops import rocm_aiter_ops - from vllm.attention.backends.registry import AttentionBackendEnum if rocm_aiter_ops.is_mha_enabled(): # Note: AITER FA is only supported for Qwen-VL models. 
@@ -222,7 +219,6 @@ class RocmPlatform(Platform): attn_type: str | None = None, ) -> str: from vllm._aiter_ops import rocm_aiter_ops - from vllm.attention.backends.registry import AttentionBackendEnum if use_sparse: if kv_cache_dtype.startswith("fp8"): diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 04325a522f444..cbc0a996f3661 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, cast import torch from tpu_info import device +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger @@ -15,7 +16,6 @@ from .interface import Platform, PlatformEnum if TYPE_CHECKING: from typing import TypeAlias - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import BlockSize from vllm.pooling_params import PoolingParams @@ -26,7 +26,6 @@ else: BlockSize = None VllmConfig = None PoolingParams = None - AttentionBackendEnum = None ParamsType = None logger = init_logger(__name__) @@ -67,8 +66,6 @@ class TpuPlatform(Platform): use_sparse, attn_type: str | None = None, ) -> str: - from vllm.attention.backends.registry import AttentionBackendEnum - if use_sparse: raise NotImplementedError("Sparse Attention is not supported on TPU.") if selected_backend != AttentionBackendEnum.PALLAS: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 18a3186b142f1..768714fb16726 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -8,16 +8,15 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig else: VllmConfig = None - 
AttentionBackendEnum = None logger = init_logger(__name__) @@ -60,8 +59,6 @@ class XPUPlatform(Platform): "only NHD layout is supported by XPU attention kernels." ) - from vllm.attention.backends.registry import AttentionBackendEnum - if use_sparse: raise NotImplementedError("Sparse Attention is not supported on XPU.") if selected_backend == AttentionBackendEnum.TRITON_ATTN: @@ -116,8 +113,6 @@ class XPUPlatform(Platform): def get_vit_attn_backend( cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": - from vllm.attention.backends.registry import AttentionBackendEnum - return AttentionBackendEnum.FLASH_ATTN @classmethod diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index d0b1f8c1b8071..fed7dcdf293bd 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -51,8 +51,6 @@ class CPUAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """CPU attention supports decoder and encoder-only attention.""" - from vllm.attention.backends.abstract import AttentionType - return attn_type in ( AttentionType.DECODER, AttentionType.ENCODER, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a1558073003fd..fb080b0b33bc0 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -84,8 +84,6 @@ class FlashAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """FlashAttention supports all attention types.""" - from vllm.attention.backends.abstract import AttentionType - return attn_type in ( AttentionType.DECODER, AttentionType.ENCODER, diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 3869f1f4164c9..8de0a0a11471f 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -87,8 
+87,6 @@ class FlexAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """FlexAttention supports both decoder and encoder-only attention.""" - from vllm.attention.backends.abstract import AttentionType - return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) @staticmethod diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index ea9dccc702a0a..6e0d84e4fb4ac 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -24,12 +24,15 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils.math_utils import cdiv if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionImpl from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch import vllm.envs as envs -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionImpl, + AttentionMetadata, +) from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout, ) diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index 3afce55890752..2cdd5ba5ffe5c 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -6,12 +6,12 @@ from typing import TYPE_CHECKING import torch +from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.worker.worker import OffloadingHandler if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig logger = init_logger(__name__) @@ -51,7 +51,7 @@ class OffloadingSpec(ABC): def get_handlers( self, kv_caches: dict[str, torch.Tensor], - attn_backends: dict[str, type["AttentionBackend"]], + attn_backends: dict[str, 
type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: """ Get offloading handlers along with their respective src and dst types. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7b9037c03d4f0..7600df48150ac 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -8,6 +8,7 @@ import numpy as np import torch import torch.nn as nn +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( CompilationMode, CUDAGraphMode, @@ -157,8 +158,6 @@ class EagleProposer: ) # Determine allowed attention backends once during initialization. - from vllm.attention.backends.registry import AttentionBackendEnum - self.allowed_attn_types: tuple | None = None if current_platform.is_rocm(): rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 92e4ce3abdba3..bd88cb1b253f8 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -2,11 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from dataclasses import dataclass, field -from typing import TYPE_CHECKING import torch from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.layer import Attention from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index @@ -17,9 +17,6 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec -if TYPE_CHECKING: - from vllm.attention.layer import Attention - class MultiModalBudget: """Helper class to calculate budget information for multi-modal models.""" @@ -278,7 +275,7 @@ def 
add_kv_sharing_layers_to_kv_cache_groups( def bind_kv_cache( kv_caches: dict[str, torch.Tensor], - forward_context: dict[str, "Attention"], + forward_context: dict[str, Attention], runner_kv_caches: list[torch.Tensor], num_attn_module: int = 1, ) -> None: From e1f262337bcf774032019b5b717a6297a860f190 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:42:14 +0000 Subject: [PATCH 015/770] Update Transformers pin in CI to 4.57.3 (#29418) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index d9c5d89c1d52f..53b012372be8e 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -29,7 +29,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test -transformers==4.57.1 +transformers==4.57.3 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. # quantization diff --git a/requirements/test.in b/requirements/test.in index 05f6bcca5c2c4..da7a7db1f00c9 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -37,7 +37,7 @@ datamodel_code_generator # required for minicpm3 test # TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test mteb[bm25s]>=2, <3 # required for mteb test -transformers==4.57.1 +transformers==4.57.3 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. 
# quantization diff --git a/requirements/test.txt b/requirements/test.txt index bcd511660f85e..c5f103b8b0d78 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1196,7 +1196,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.57.1 +transformers==4.57.3 # via # -r requirements/test.in # genai-perf From 0840abdd242bbc7d0c42f0bfa73fec94a44e921b Mon Sep 17 00:00:00 2001 From: Injae Ryou Date: Fri, 28 Nov 2025 01:53:10 +0900 Subject: [PATCH 016/770] [BugFix] Optional tokenizer argument when loading GGUF models (#29582) Signed-off-by: Injae Ryou Signed-off-by: Isotr0py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Isotr0py --- vllm/config/model.py | 15 +++++----- vllm/transformers_utils/gguf_utils.py | 42 +++++++++++++++++++++++++++ vllm/transformers_utils/tokenizer.py | 10 ++++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 5dabd636c18c6..21d602b30ac1a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -439,13 +439,6 @@ class ModelConfig: self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: - # Check if this is a GGUF model (either local file or remote GGUF) - if is_gguf(self.model): - raise ValueError( - "Using a tokenizer is mandatory when loading a GGUF model. " - "Please specify the tokenizer path or name using the " - "--tokenizer argument." - ) self.tokenizer = self.model if self.tokenizer_revision is None: self.tokenizer_revision = self.revision @@ -699,6 +692,14 @@ class ModelConfig: self.multimodal_config = MultiModalConfig(**mm_config_kwargs) + # Multimodal GGUF models must use original repo for mm processing + if is_gguf(self.tokenizer) and self.is_multimodal_model: + raise ValueError( + "Loading a multimodal GGUF model needs to use original " + "tokenizer. 
Please specify the unquantized hf model's " + "repo name or path using the --tokenizer argument." + ) + if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len # can be correctly capped to sliding window size diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py index 2bf59c91a3bb1..f727b1b4726bb 100644 --- a/vllm/transformers_utils/gguf_utils.py +++ b/vllm/transformers_utils/gguf_utils.py @@ -9,6 +9,7 @@ from gguf.constants import Keys, VisionProjectorType from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig from vllm.logger import init_logger +from vllm.transformers_utils.config import list_filtered_repo_files logger = init_logger(__name__) @@ -164,3 +165,44 @@ def maybe_patch_hf_config_from_gguf( hf_config = new_hf_config return hf_config + + +def get_gguf_file_path_from_hf( + repo_id: str | Path, + quant_type: str, + revision: str | None = None, +) -> str: + """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type. 
+ + Args: + repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B") + quant_type: The quantization type (e.g., "Q4_K_M", "F16") + revision: Optional revision/branch name + + Returns: + The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"), + """ + repo_id = str(repo_id) + gguf_patterns = [ + f"*-{quant_type}.gguf", + f"*-{quant_type}-*.gguf", + f"*/*-{quant_type}.gguf", + f"*/*-{quant_type}-*.gguf", + ] + matching_files = list_filtered_repo_files( + repo_id, + allow_patterns=gguf_patterns, + revision=revision, + ) + + if len(matching_files) == 0: + raise ValueError( + "Could not find GGUF file for repo %s with quantization %s.", + repo_id, + quant_type, + ) + + # Sort to ensure consistent ordering (prefer non-sharded files) + matching_files.sort(key=lambda x: (x.count("-"), x)) + gguf_filename = matching_files[0] + return gguf_filename diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index f0e0ba8ef4246..929dc8bf481cb 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -19,6 +19,7 @@ from vllm.transformers_utils.config import ( get_sentence_transformer_tokenizer_config, list_filtered_repo_files, ) +from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import ( check_gguf_file, @@ -190,7 +191,14 @@ def get_tokenizer( kwargs["gguf_file"] = Path(tokenizer_name).name tokenizer_name = Path(tokenizer_name).parent elif is_remote_gguf(tokenizer_name): - tokenizer_name, _ = split_remote_gguf(tokenizer_name) + tokenizer_name, quant_type = split_remote_gguf(tokenizer_name) + # Get the HuggingFace Hub path for the GGUF file + gguf_file = get_gguf_file_path_from_hf( + tokenizer_name, + quant_type, + revision=revision, + ) + kwargs["gguf_file"] = gguf_file # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format # first to use 
official Mistral tokenizer if possible. From ee9841daa995a606139775043f0199d6a81037b3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 01:08:08 +0800 Subject: [PATCH 017/770] [Bugfix] Fix doc build on main (#29619) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4267b6c6598e2..85c5574bacf0a 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -167,8 +167,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): default_pooling_type: ClassVar[str] = "LAST" """ - Indicates the - [vllm.model_executor.layers.pooler.PoolerConfig.pooling_type][] + Indicates the [vllm.config.pooler.PoolerConfig.pooling_type][] to use by default. You can use the From d45269b37844b992dc4b34c0509ad8319bc043e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=9C=E5=86=BB=E8=99=BE=E4=BB=81?= Date: Fri, 28 Nov 2025 01:21:00 +0800 Subject: [PATCH 018/770] add skip_reading_prefix_cache in repr for PoolingParams (#29620) --- vllm/pooling_params.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index d1aab98c274e1..c2094a2d920a2 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -219,6 +219,7 @@ class PoolingParams( f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids}, " + f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, " f"extra_kwargs={self.extra_kwargs})" ) From ea228b4491342f6b7a283e1a414e1a75171a0241 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 02:39:59 +0800 Subject: [PATCH 019/770] [Misc] Remove unused code from `protocol.py` (#29616) Signed-off-by: DarkLight1337 --- vllm/engine/protocol.py | 9 --------- 1 file changed, 9 deletions(-) 
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 5e3374f9f6a10..6b3ee042daf3e 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import enum from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any from vllm.config import ModelConfig, VllmConfig from vllm.inputs.data import PromptType -from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors import IOProcessor @@ -19,13 +17,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.processor import Processor -logger = init_logger(__name__) - - -class Device(enum.Enum): - GPU = enum.auto() - CPU = enum.auto() - class EngineClient(ABC): """Protocol class for Clients to Engine""" From a24ea5414bc5b623cde301c8c6e1c5082ecfe412 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 03:04:58 +0800 Subject: [PATCH 020/770] [Deprecation] Advance deprecation status (#29617) Signed-off-by: DarkLight1337 --- vllm/config/scheduler.py | 15 +-------- vllm/distributed/parallel_state.py | 19 ----------- vllm/model_executor/models/utils.py | 49 ----------------------------- vllm/v1/core/sched/output.py | 4 +-- 4 files changed, 3 insertions(+), 84 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 2cf42d57ec217..ff1ac0e18f324 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from pydantic import Field, field_validator from pydantic.dataclasses import dataclass -from typing_extensions import Self, deprecated +from typing_extensions import Self from vllm.config.utils import config from 
vllm.logger import init_logger @@ -224,19 +224,6 @@ class SchedulerConfig: self.verify_max_model_len(max_model_len) - @property - @deprecated( - "`SchedulerConfig.chunked_prefill_enabled` has been renamed to " - "`SchedulerConfig.enable_chunked_prefill`. " - "The old name will be removed in v0.12." - ) - def chunked_prefill_enabled(self) -> bool: - return self.enable_chunked_prefill - - @chunked_prefill_enabled.setter - def chunked_prefill_enabled(self, value: bool): - self.enable_chunked_prefill = value - def verify_max_model_len(self, max_model_len: int) -> Self: if ( self.max_num_batched_tokens < max_model_len diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 69c28e278f2d2..52b433cfaf1bd 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -41,7 +41,6 @@ import torch.distributed import torch.distributed._functional_collectives as funcol import torch.distributed._symmetric_memory from torch.distributed import Backend, ProcessGroup -from typing_extensions import deprecated import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( @@ -1078,15 +1077,6 @@ def get_tp_group() -> GroupCoordinator: return _TP -@deprecated( - "`get_tensor_model_parallel_group` has been replaced with " - "`get_tp_group` and may be removed after v0.12. Please use " - "`get_tp_group` instead." -) -def get_tensor_model_parallel_group(): - return get_tp_group() - - _DCP: GroupCoordinator | None = None @@ -1130,15 +1120,6 @@ def get_pcp_group() -> GroupCoordinator: return _PCP -@deprecated( - "`get_pipeline_model_parallel_group` has been replaced with " - "`get_pp_group` and may be removed in v0.12. Please use " - "`get_pp_group` instead." 
-) -def get_pipeline_model_parallel_group(): - return get_pp_group() - - @contextmanager def graph_capture(device: torch.device): """ diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ccefd7e66697f..f25ab9153a50d 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -10,7 +10,6 @@ import torch import torch.nn as nn from torch.func import functional_call from transformers import PretrainedConfig -from typing_extensions import deprecated from vllm.config import VllmConfig from vllm.distributed import ( @@ -481,54 +480,6 @@ def _merge_multimodal_embeddings( return inputs_embeds -@deprecated( - "`merge_multimodal_embeddings` has been replaced with " - "`SupportsMultiModal.embed_input_ids` and will be " - "removed in v0.12." -) -def merge_multimodal_embeddings( - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: NestedTensors, - placeholder_token_id: int | list[int], -) -> torch.Tensor: - """ - Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the - positions in `inputs_embeds` corresponding to placeholder tokens in - `input_ids`. - - `placeholder_token_id` can be a list of token ids (e.g, token ids - of img_start, img_break, and img_end tokens) when needed: This means - the order of these tokens in the `input_ids` MUST MATCH the order of - their embeddings in `multimodal_embeddings` since we need to - slice-merge instead of individually scattering. - - For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where - - T is text token - - S is image start token - - I is image embedding token - - B is image break token - - E is image end token. - - Then the image embeddings (that correspond to I's) from vision encoder - must be padded with embeddings of S, B, and E in the same order of - input_ids for a correct embedding merge. - - Note: - This updates `inputs_embeds` in place. 
- """ - if isinstance(placeholder_token_id, list): - is_multimodal = isin_list(input_ids, placeholder_token_id) - else: - is_multimodal = input_ids == placeholder_token_id - - return _merge_multimodal_embeddings( - inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) - - def isin_list( elements: torch.Tensor, test_elements_list: list[int], diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 7902513dce49a..abfab43499b2a 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -126,12 +126,12 @@ class CachedRequestData: return len(self.req_ids) @cached_property - @deprecated("use resumed_req_ids field") + @deprecated("This will be removed in v0.14, use `resumed_req_ids` instead.") def resumed_from_preemption(self) -> list[bool]: return [req_id in self.resumed_req_ids for req_id in self.req_ids] @cached_property - @deprecated("use all_token_ids field") + @deprecated("This will be removed in v0.14, use `all_token_ids` instead.") def resumed_req_token_ids(self) -> list[list[int] | None]: return [ self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None From 38658ec6f3b3a09a6cd205bab23a550b3d3f8c0e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 28 Nov 2025 03:17:37 +0800 Subject: [PATCH 021/770] [Bugfix][MM encoder] Fix ViT attention backend resolving for Turing GPU (#29614) Signed-off-by: Isotr0py --- vllm/platforms/cuda.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index d5c3a177d9c2b..4bf9401b6b051 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -264,14 +264,15 @@ class CudaPlatformBase(Platform): cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": # Try FlashAttention first - try: - backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() - if backend_class.supports_head_size( - head_size - ) and 
backend_class.supports_dtype(dtype): - return AttentionBackendEnum.FLASH_ATTN - except ImportError: - pass + if (cc := cls.get_device_capability()) and cc.major >= 8: + try: + backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() + if backend_class.supports_head_size( + head_size + ) and backend_class.supports_dtype(dtype): + return AttentionBackendEnum.FLASH_ATTN + except ImportError: + pass return AttentionBackendEnum.TORCH_SDPA From e5a621b724e5570aaffc4bbf9c5f6ec9bca63333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 27 Nov 2025 20:31:52 +0100 Subject: [PATCH 022/770] [CI] Add batched audios Whisper test (#29308) Signed-off-by: NickLucche --- .../openai/test_transcription_validation.py | 197 +-------------- .../test_transcription_validation_whisper.py | 237 ++++++++++++++++++ 2 files changed, 238 insertions(+), 196 deletions(-) create mode 100644 tests/entrypoints/openai/test_transcription_validation_whisper.py diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 88580ed899f1a..8045ab1468d6a 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -2,20 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for structured outputs tests -import io import json -import librosa -import numpy as np -import openai import pytest -import pytest_asyncio -import soundfile as sf from ...utils import RemoteOpenAIServer -MODEL_NAME = "openai/whisper-large-v3-turbo" -SERVER_ARGS = ["--enforce-eager"] MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", "mistral", @@ -26,22 +18,8 @@ MISTRAL_FORMAT_ARGS = [ ] -@pytest.fixture(scope="module") -def server(): - with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: 
- yield async_client - - @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"] -) +@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) async def test_basic_audio(mary_had_lamb, model_name): server_args = ["--enforce-eager"] @@ -120,176 +98,3 @@ async def test_basic_audio_gemma(foscolo): ) out = json.loads(transcription)["text"] assert "da cui vergine nacque Venere" in out - - -@pytest.mark.asyncio -async def test_non_asr_model(winning_call): - # text to text model - model_name = "JackFram/llama-68m" - with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create( - model=model_name, file=winning_call, language="en", temperature=0.0 - ) - err = res.error - assert err["code"] == 400 and not res.text - assert err["message"] == "The model does not support Transcriptions API" - - -@pytest.mark.asyncio -async def test_bad_requests(mary_had_lamb, client): - # invalid language - with pytest.raises(openai.BadRequestError): - await client.audio.transcriptions.create( - model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0 - ) - - -@pytest.mark.asyncio -async def test_long_audio_request(mary_had_lamb, client): - mary_had_lamb.seek(0) - audio, sr = librosa.load(mary_had_lamb) - # Add small silence after each audio for repeatability in the split process - audio = np.pad(audio, (0, 1600)) - repeated_audio = np.tile(audio, 10) - # Repeated audio to buffer - buffer = io.BytesIO() - sf.write(buffer, repeated_audio, sr, format="WAV") - buffer.seek(0) - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=buffer, - language="en", - response_format="text", - temperature=0.0, - ) - out = json.loads(transcription) - out_text = out["text"] - out_usage = out["usage"] - counts = out_text.count("Mary had a little lamb") - assert counts == 10, counts 
- assert out_usage["seconds"] == 161, out_usage["seconds"] - - -@pytest.mark.asyncio -async def test_completion_endpoints(client): - # text to text model - res = await client.chat.completions.create( - model=MODEL_NAME, - messages=[{"role": "system", "content": "You are a helpful assistant."}], - ) - err = res.error - assert err["code"] == 400 - assert err["message"] == "The model does not support Chat Completions API" - - res = await client.completions.create(model=MODEL_NAME, prompt="Hello") - err = res.error - assert err["code"] == 400 - assert err["message"] == "The model does not support Completions API" - - -@pytest.mark.asyncio -async def test_streaming_response(winning_call, client): - transcription = "" - res_no_stream = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=winning_call, - response_format="json", - language="en", - temperature=0.0, - ) - res = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=winning_call, - language="en", - temperature=0.0, - stream=True, - timeout=30, - ) - # Reconstruct from chunks and validate - async for chunk in res: - text = chunk.choices[0]["delta"]["content"] - transcription += text - - assert transcription == res_no_stream.text - - -@pytest.mark.asyncio -async def test_stream_options(winning_call, client): - res = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=winning_call, - language="en", - temperature=0.0, - stream=True, - extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True), - timeout=30, - ) - final = False - continuous = True - async for chunk in res: - if not len(chunk.choices): - # final usage sent - final = True - else: - continuous = continuous and hasattr(chunk, "usage") - assert final and continuous - - -@pytest.mark.asyncio -async def test_sampling_params(mary_had_lamb, client): - """ - Compare sampling with params and greedy sampling to assert results - are different when extreme sampling parameters values are picked. 
- """ - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - temperature=0.8, - extra_body=dict( - seed=42, - repetition_penalty=1.9, - top_k=12, - top_p=0.4, - min_p=0.5, - frequency_penalty=1.8, - presence_penalty=2.0, - ), - ) - - greedy_transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - temperature=0.0, - extra_body=dict(seed=42), - ) - - assert greedy_transcription.text != transcription.text - - -@pytest.mark.asyncio -async def test_audio_prompt(mary_had_lamb, client): - prompt = "This is a speech, recorded in a phonograph." - # Prompts should not omit the part of original prompt while transcribing. - prefix = "The first words I spoke in the original phonograph" - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - response_format="text", - temperature=0.0, - ) - out = json.loads(transcription)["text"] - assert prefix in out - transcription_wprompt = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - response_format="text", - prompt=prompt, - temperature=0.0, - ) - out_prompt = json.loads(transcription_wprompt)["text"] - assert prefix in out_prompt diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py new file mode 100644 index 0000000000000..82c50e58a0168 --- /dev/null +++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# imports for structured outputs tests +import asyncio +import io +import json + +import librosa +import numpy as np +import openai +import pytest +import pytest_asyncio +import soundfile as sf + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = 
"openai/whisper-large-v3-turbo" +SERVER_ARGS = ["--enforce-eager"] + + +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def whisper_client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_basic_audio(mary_had_lamb): + server_args = ["--enforce-eager"] + + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. + with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription) + out_text = out["text"] + out_usage = out["usage"] + assert "Mary had a little lamb," in out_text + assert out_usage["seconds"] == 16, out_usage["seconds"] + + +@pytest.mark.asyncio +async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client): + transcription = whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + transcription2 = whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + response_format="text", + temperature=0.0, + ) + # Await both transcriptions by scheduling coroutines together + transcription, transcription2 = await asyncio.gather(transcription, transcription2) + out = json.loads(transcription) + out_text = out["text"] + assert "Mary had a little lamb," in out_text + out2 = json.loads(transcription2) + out_text2 = out2["text"] + assert "Edgar Martinez" in out_text2 + + +@pytest.mark.asyncio +async def test_bad_requests(mary_had_lamb, whisper_client): + # invalid language + with pytest.raises(openai.BadRequestError): + 
await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0 + ) + + +@pytest.mark.asyncio +async def test_long_audio_request(mary_had_lamb, whisper_client): + mary_had_lamb.seek(0) + audio, sr = librosa.load(mary_had_lamb) + # Add small silence after each audio for repeatability in the split process + audio = np.pad(audio, (0, 1600)) + repeated_audio = np.tile(audio, 10) + # Repeated audio to buffer + buffer = io.BytesIO() + sf.write(buffer, repeated_audio, sr, format="WAV") + buffer.seek(0) + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=buffer, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription) + out_text = out["text"] + out_usage = out["usage"] + counts = out_text.count("Mary had a little lamb") + assert counts == 10, counts + assert out_usage["seconds"] == 161, out_usage["seconds"] + + +@pytest.mark.asyncio +async def test_completion_endpoints(whisper_client): + # text to text model + res = await whisper_client.chat.completions.create( + model=MODEL_NAME, + messages=[{"role": "system", "content": "You are a helpful assistant."}], + ) + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Chat Completions API" + + res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello") + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Completions API" + + +@pytest.mark.asyncio +async def test_streaming_response(winning_call, whisper_client): + transcription = "" + res_no_stream = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + response_format="json", + language="en", + temperature=0.0, + ) + res = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + timeout=30, + ) + # 
Reconstruct from chunks and validate + async for chunk in res: + text = chunk.choices[0]["delta"]["content"] + transcription += text + + assert transcription == res_no_stream.text + + +@pytest.mark.asyncio +async def test_stream_options(winning_call, whisper_client): + res = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True), + timeout=30, + ) + final = False + continuous = True + async for chunk in res: + if not len(chunk.choices): + # final usage sent + final = True + else: + continuous = continuous and hasattr(chunk, "usage") + assert final and continuous + + +@pytest.mark.asyncio +async def test_sampling_params(mary_had_lamb, whisper_client): + """ + Compare sampling with params and greedy sampling to assert results + are different when extreme sampling parameters values are picked. + """ + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict( + seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0, + ), + ) + + greedy_transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42), + ) + + assert greedy_transcription.text != transcription.text + + +@pytest.mark.asyncio +async def test_audio_prompt(mary_had_lamb, whisper_client): + prompt = "This is a speech, recorded in a phonograph." + # Prompts should not omit the part of original prompt while transcribing. 
+ prefix = "The first words I spoke in the original phonograph" + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription)["text"] + assert prefix in out + transcription_wprompt = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + prompt=prompt, + temperature=0.0, + ) + out_prompt = json.loads(transcription_wprompt)["text"] + assert prefix in out_prompt From a5345bf49df74cd394a07797649f51cd67c6c697 Mon Sep 17 00:00:00 2001 From: Andrii Skliar Date: Thu, 27 Nov 2025 20:34:59 +0100 Subject: [PATCH 023/770] [BugFix] Fix `plan` API Mismatch when using latest FlashInfer (#29426) Signed-off-by: Andrii Skliar Co-authored-by: Andrii Skliar --- docker/Dockerfile | 4 ++-- requirements/cuda.txt | 2 +- vllm/v1/attention/backends/flashinfer.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index aa3aad21d6c07..eb7c105071c00 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer pre-compiled kernel cache and binaries # https://docs.flashinfer.ai/installation.html RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system flashinfer-cubin==0.5.2 \ - && uv pip install --system flashinfer-jit-cache==0.5.2 \ + uv pip install --system flashinfer-cubin==0.5.3 \ + && uv pip install --system flashinfer-jit-cache==0.5.3 \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') \ && flashinfer show-config diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 15e8aadc56f47..462f18ef7159b 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -10,4 +10,4 @@ torchaudio==2.9.0 # These must be updated alongside torch torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile -flashinfer-python==0.5.2 +flashinfer-python==0.5.3 diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index dbd72b298b1fd..777398bf8a20e 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -1508,7 +1508,7 @@ def fast_plan_decode( qo_indptr_host = _get_range_buf(batch_size + 1, "cpu") try: - # Make sure we pass exactly 18 arguments for tensor core version + # Make sure we pass exactly 19 arguments for tensor core version self._plan_info = self._cached_module.plan( self._float_workspace_buffer, self._int_workspace_buffer, @@ -1528,6 +1528,7 @@ def fast_plan_decode( window_left, fixed_split_size, disable_split_kv, + 0, ) except Exception as e: raise RuntimeError(f"Error in tensor core plan: {e}") from e From ae0ce1be272105f02a3ac6a63e646690be2481fb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 27 Nov 2025 12:38:53 -0800 Subject: [PATCH 024/770] [Model Runner V2][BugFix] Keep reference to GPU tensors in AsyncOutput (#29623) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/async_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index 421fb29a7f87f..f6bc607c1ae67 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -21,6 +21,9 @@ class AsyncOutput(AsyncModelRunnerOutput): copy_stream: torch.cuda.Stream, copy_event: torch.cuda.Event, ): + # 
NOTE(woosuk): We must retain references to the GPU tensors, + # as the copy operations are performed on a different CUDA stream than + # the one where the tensors were created. self.model_runner_output = model_runner_output self.sampler_output = sampler_output self.num_sampled_tokens = num_sampled_tokens @@ -51,7 +54,9 @@ class AsyncOutput(AsyncModelRunnerOutput): ) else: self.logprobs_tensors = None - self.num_sampled_tokens = num_sampled_tokens.to("cpu", non_blocking=True) + self.num_sampled_tokens_cpu = num_sampled_tokens.to( + "cpu", non_blocking=True + ) self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): @@ -63,7 +68,7 @@ class AsyncOutput(AsyncModelRunnerOutput): def get_output(self) -> ModelRunnerOutput: self.copy_event.synchronize() - num_sampled_tokens_np = self.num_sampled_tokens.numpy() + num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy() # NOTE(woosuk): The following code is to ensure compatibility with # the existing model runner. 
From be493e0b3cfb5810d254e9845217878a39a4853b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 27 Nov 2025 16:45:38 -0500 Subject: [PATCH 025/770] [BugFix] Fix new nightly failures (#29578) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/utils.py | 26 ++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 12 +++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 6e0d84e4fb4ac..27f07218d9b2e 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -100,6 +100,32 @@ class CommonAttentionMetadata: dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" + # TODO(lucas): remove once we have FULL-CG spec-decode support + def unpadded( + self, num_actual_tokens: int, num_actual_reqs: int + ) -> "CommonAttentionMetadata": + maybe_slice_reqs = lambda x: x[:num_actual_reqs] if x is not None else None + return CommonAttentionMetadata( + query_start_loc=self.query_start_loc[: num_actual_reqs + 1], + query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], + seq_lens=self.seq_lens[:num_actual_reqs], + seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs], + num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], + num_reqs=num_actual_reqs, + num_actual_tokens=num_actual_tokens, + max_query_len=self.max_query_len, + max_seq_len=self.max_seq_len, + block_table_tensor=self.block_table_tensor[:num_actual_reqs], + slot_mapping=self.slot_mapping[:num_actual_tokens], + causal=self.causal, + logits_indices_padded=self.logits_indices_padded, + num_logits_indices=self.num_logits_indices, + encoder_seq_lens=maybe_slice_reqs(self.encoder_seq_lens), + encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu), + dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens), + 
dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu), + ) + def slice_query_start_locs( query_start_loc: torch.Tensor, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0ae4eb48acf22..6bff83658b45a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1551,7 +1551,7 @@ class GPUModelRunner( # Encoder-only layers do not have KV cache, so we need to # create a dummy block table and slot mapping for them. blk_table_tensor = torch.zeros( - (num_tokens_padded, 1), + (num_reqs_padded, 1), dtype=torch.int32, device=self.device, ) @@ -1652,6 +1652,16 @@ class GPUModelRunner( for layer_name in attn_group.layer_names: attn_metadata[layer_name] = attn_metadata_i + if spec_decode_common_attn_metadata is not None and ( + num_reqs != num_reqs_padded or num_tokens != num_tokens_padded + ): + # Currently the drafter still only uses piecewise cudagraphs (and modifies + # the attention metadata in directly), and therefore does not want to use + # padded attention metadata. 
+ spec_decode_common_attn_metadata = ( + spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs) + ) + return attn_metadata, spec_decode_common_attn_metadata def _compute_cascade_attn_prefix_lens( From 35657bcd7a5fd7a7af1aa1b19d78eb8973ec79c1 Mon Sep 17 00:00:00 2001 From: scydas Date: Fri, 28 Nov 2025 09:34:33 +0800 Subject: [PATCH 026/770] [CPU]Update CPU PyTorch to 2.9.0 (#29589) Signed-off-by: scyda Co-authored-by: Li, Jiang --- docker/Dockerfile.cpu | 4 ---- requirements/cpu-build.txt | 4 ++-- requirements/cpu.txt | 8 ++++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index eb3807ef0ca4e..67d3fb83a0275 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -119,7 +119,6 @@ FROM base AS vllm-test-deps WORKDIR /workspace/vllm -# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ @@ -132,9 +131,6 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ esac; \ }; \ remove_packages_not_supported_on_aarch64 && \ - sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ - sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ - sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 81d429a5e5f8d..0c6fdd3b33cd1 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,9 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.8.0+cpu; platform_machine == 
"x86_64" or platform_machine == "s390x" +torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index e23d3286f3f78..8c04d6d5ce1b0 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.8.0; platform_machine == "ppc64le" +torchaudio==2.9.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.23.0; platform_machine == "ppc64le" +torchvision==0.24.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs From 745a3bae1aef2ff3aa70b3eab8624e4571698ba0 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:48:28 -0800 Subject: [PATCH 027/770] [LoRA] Support FusedMoE LoRA Triton kernel for mxfp4 (#28971) 
Signed-off-by: Xin Yang Co-authored-by: Jee Jee Li --- .../moe/test_modular_oai_triton_moe.py | 250 ++++++++++++++++++ vllm/lora/layers/fused_moe.py | 37 ++- .../fused_moe/gpt_oss_triton_kernels_moe.py | 146 ++++++++++ .../layers/quantization/mxfp4.py | 20 +- 4 files changed, 441 insertions(+), 12 deletions(-) create mode 100644 tests/kernels/moe/test_modular_oai_triton_moe.py diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py new file mode 100644 index 0000000000000..3361d85e92507 --- /dev/null +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test modular OAI Triton MoE +""" + +import pytest +import torch + +from vllm.utils.import_utils import has_triton_kernels + +if not has_triton_kernels(): + pytest.skip( + "triton_kernels not found, skipping all related tests", + allow_module_level=True, + ) + +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.testing import assert_close + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + OAITritonExperts, + UnfusedOAITritonExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) +from vllm.model_executor.layers.utils import shuffle_weight +from vllm.platforms import current_platform + +MNK = [ + (1, 512, 384), + (1, 2880, 2880), 
+ (2, 512, 384), + (2, 2880, 2880), + (32, 2880, 2880), + (64, 2880, 2880), +] + + +def unshuffle_weight(w: torch.Tensor): + first = w[..., ::2] + second = w[..., 1::2] + return torch.concat((first, second), dim=-1) + + +def make_weights(dtype, k, n, e): + w1 = torch.randn((e, k, 2 * n), dtype=dtype, device="cuda") + w1_bias = torch.randn((e, 2 * n), dtype=dtype, device="cuda") + + w2 = torch.randn((e, n, k), dtype=dtype, device="cuda") + w2_bias = torch.randn((e, k), dtype=dtype, device="cuda") + + w1_tri = w1.clone() + w2_tri = w2.clone() + + w1_bias_tri = w1_bias.clone() + w2_bias_tri = w2_bias.clone() + w1_bias_tri = w1_bias_tri.to(torch.float32) + w2_bias_tri = w2_bias_tri.to(torch.float32) + + # shuffle weights + w1_tri = shuffle_weight(w1_tri) + w1_bias_tri = shuffle_weight(w1_bias_tri) + + # quant triton_weights + w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1) + w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, dtype, axis=1) + w1 = unshuffle_weight(w1) + + w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1) + w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, dtype, axis=1) + + num_warps = 8 + w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) + w_scale_layout, w_scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=num_warps) + ) + + w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts) + w1_scale_tri = convert_layout( + wrap_torch_tensor(w1_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) + + w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts) + w2_scale_tri = convert_layout( + wrap_torch_tensor(w2_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) + + w1_precision_config = PrecisionConfig( + weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData()) + ) + w2_precision_config = PrecisionConfig( + weight_scale=w2_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData()) + ) + + return ( 
+ w1, + w2, + w1_bias, + w2_bias, + w1_tri, + w2_tri, + w1_bias_tri, + w2_bias_tri, + w1_precision_config, + w2_precision_config, + ) + + +def swiglu(x, alpha: float = 1.702, limit: float = 1.0): + # Note we add an extra bias of 1 to the linear layer + x_glu, x_linear = torch.chunk(x, 2, dim=-1) + if limit is not None: + x_glu = x_glu.clamp(max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + if limit is not None: + x_linear = x_linear.clamp(min=-limit, max=limit) + return out_glu * (x_linear + 1) + + +def torch_moe_impl( + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, K, 2N) + w2: torch.Tensor, # (E, N, K) + w1_bias: torch.Tensor, # (E, 2N) + w2_bias: torch.Tensor, # (E, K) + topk_weights: torch.Tensor, # (M, topk) + topk_ids: torch.Tensor, # (M, topk) +): + w1 = w1[topk_ids, ...] + w1_bias = w1_bias[topk_ids, ...] + hidden_states = torch.einsum("bekc,bk->bec", w1, hidden_states) + w1_bias + hidden_states = swiglu(hidden_states, limit=7) + + w2 = w2[topk_ids, ...] + w2_bias = w2_bias[topk_ids, ...] 
+ hidden_states = torch.einsum("bekc,bek->bec", w2, hidden_states) + w2_bias + + # Weighted sum of experts + hidden_states = torch.einsum("bec,be->bc", hidden_states, topk_weights) + return hidden_states + + +def oai_triton_moe_impl( + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: "PrecisionConfig", + w2_scale: "PrecisionConfig", + w1_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, + num_experts: int, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + unfused: bool = False, +) -> torch.Tensor: + quant_config = mxfp4_w4a16_moe_quant_config( + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + + if unfused: + fused_experts = UnfusedOAITritonExperts(quant_config) + else: + fused_experts = OAITritonExperts(quant_config) + + mk = FusedMoEModularKernel(MoEPrepareAndFinalizeNoEP(), fused_experts) + + return mk.forward( + hidden_states=x, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation="swigluoai", + global_num_experts=num_experts, + expert_map=None, + apply_router_weight_on_input=False, + ) + + +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform." 
+) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("m,n,k", MNK) +@pytest.mark.parametrize("num_experts", [32, 128]) +@pytest.mark.parametrize("topk", [4]) +@pytest.mark.parametrize("unfused", [True, False]) +def test_oai_triton_moe( + dtype: torch.dtype, + m: int, + n: int, + k: int, + num_experts: int, + topk: int, + unfused: bool, +): + current_platform.seed_everything(0) + ( + w1, + w2, + w1_bias, + w2_bias, + w1_tri, + w2_tri, + w1_bias_tri, + w2_bias_tri, + w1_precision_config, + w2_precision_config, + ) = make_weights(dtype, k, n, num_experts) + + x = torch.randn((m, k), dtype=dtype, device="cuda") + router_logits = torch.randn(m, num_experts, device="cuda", dtype=dtype) + topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1, sorted=True) + topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) + + with set_current_vllm_config(VllmConfig()): + out_ref = torch_moe_impl(x, w1, w2, w1_bias, w2_bias, topk_weights, topk_ids) + + out = oai_triton_moe_impl( + x, + w1_tri, + w2_tri, + w1_precision_config, + w2_precision_config, + w1_bias_tri, + w2_bias_tri, + num_experts, + topk_weights, + topk_ids, + unfused, + ) + + assert_close(ref=out_ref, tri=out, maxtol=0.025, rmstol=0.005) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 3ad19370962ab..24cab79a72443 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -20,15 +20,24 @@ from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - modular_marlin_fused_moe, + MarlinExperts, ) from vllm.model_executor.layers.fused_moe.fused_moe import ( - modular_triton_fused_moe, + TritonExperts, try_get_optimal_moe_config, ) from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + 
UnfusedOAITritonExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel, +) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from .utils import _get_lora_device @@ -114,15 +123,23 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.base_layer.ensure_moe_quant_config_init() quant_config = self.base_layer.quant_method.moe_quant_config - m_fused_moe_fn = ( - modular_triton_fused_moe( - quant_config, shared_experts=self.base_layer.shared_experts - ) - if not quant_config.use_mxfp4_w4a16 - else modular_marlin_fused_moe( - quant_config, shared_experts=self.base_layer.shared_experts - ) + prepare_finalize = MoEPrepareAndFinalizeNoEP() + m_fused_moe_fn = FusedMoEModularKernel( + prepare_finalize, + self.base_layer.quant_method.select_gemm_impl( + prepare_finalize, self.base_layer + ), + self.base_layer.shared_experts, + getattr(self.base_layer, "shared_experts_stream", None), ) + if quant_config.use_mxfp4_w4a16: + assert isinstance( + m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts) + ) + else: + assert isinstance( + m_fused_moe_fn.fused_experts, (MarlinExperts, TritonExperts) + ) def fwd_decorator(layer, func): def wrapper(*args, **kwargs): diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 128507639fdfd..0b006e15632e1 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, @@ -376,3 +377,148 @@ class OAITritonExperts(BaseOAITritonExperts): intermediate_cache=workspace2, a1q_scale=a1q_scale, ) + + +class 
UnfusedOAITritonExperts(BaseOAITritonExperts): + """ + A Triton based MoE expert class that operates on expert standard + format and explicitly keeps the activation and reduction (moe_sum) steps + unfused from the matmul_ogs kernel. This exposes injection points + for activation and moe_sum. + + One use case for it is to inject LoRA modules on the activation and moe_sum. + """ + + def __init__(self, quant_config: FusedMoEQuantConfig): + # TODO (varun) : Enable activation quantization + assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16" + super().__init__(quant_config) + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # workspace are allocated inside the kernel + workspace1 = (M * topk, N // 2) + workspace2 = (M * topk, max(N, K)) + output = (M, K) + return (workspace1, workspace2, output) + + def moe_sum(self, input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + if self.quant_config is None: + self.quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + + if expert_map is not None: + topk_ids = 
expert_map[topk_ids] + + local_num_experts = w1.size(0) + if global_num_experts == -1: + global_num_experts = local_num_experts + + routing_data, gather_indx, scatter_indx = self._make_routing_data( + topk_ids, topk_weights, local_num_experts + ) + + topk = topk_ids.size(1) + + # type check, uint8 means mxfp4 + assert hidden_states.dtype == torch.bfloat16 + assert ( + self.quant_config.w1_bias is None + or self.quant_config.w1_bias.dtype == torch.float32 + ) + assert ( + self.quant_config.w2_bias is None + or self.quant_config.w2_bias.dtype == torch.float32 + ) + + # Shape check, only check non-mxfp4 + assert hidden_states.ndim == 2 + assert hidden_states.shape[-1] == w1.shape[-2] + assert w2.shape[-1] == w1.shape[1] + + batch_dim = 1 + M, K = hidden_states.shape + E, _, N = w1.shape + + if global_num_experts == -1: + global_num_experts = E + + # Note that the output tensor might be in workspace13 + intermediate_cache1 = _resize_cache(workspace2, (batch_dim, M * topk, N)) + intermediate_cache3 = _resize_cache(workspace2, (batch_dim, M * topk, K)) + intermediate_cache2 = _resize_cache(workspace13, (M * topk, N // 2)) + + gammas = routing_data.gate_scal if routing_data else None + + matmul_ogs( + hidden_states, + w1, + self.quant_config.w1_bias, + routing_data, + gather_indx=gather_indx, + precision_config=self.quant_config.w1_precision, + gammas=gammas if apply_router_weight_on_input else None, + fused_activation=None, + y=intermediate_cache1, + ) + + self.activation( + activation, intermediate_cache2, intermediate_cache1.view(-1, N) + ) + + # matmul_ogs grouped reduction fuse sum across multiple experts: + # y[dst_ind // n_expts_act, :] += x[src_ind, :] + # Need to set n_expts_act to 1 to unfuse moe_sum + routing_data.n_expts_act = 1 + + matmul_ogs( + intermediate_cache2, + w2, + self.quant_config.w2_bias, + routing_data, + scatter_indx=scatter_indx, + precision_config=self.quant_config.w2_precision, + gammas=None if apply_router_weight_on_input else gammas, + 
y=intermediate_cache3, + ) + + self.moe_sum(intermediate_cache3.view(-1, topk, K), output) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index bc241ac692e23..74036753496d4 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -30,6 +30,7 @@ from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( ) from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( OAITritonExperts, + UnfusedOAITritonExperts, ) from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod @@ -83,8 +84,21 @@ def get_mxfp4_backend_with_lora() -> Mxfp4Backend: if not current_platform.is_cuda(): return Mxfp4Backend.NONE - logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") - return Mxfp4Backend.MARLIN + # If FlashInfer is not available, try either Marlin or Triton + triton_kernels_supported = ( + has_triton_kernels() + and is_torch_equal_or_newer("2.8.0") + # NOTE: triton_kernels are only confirmed to work on SM90 and SM100 + # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317 + # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 + and (9, 0) <= current_platform.get_device_capability() < (11, 0) + ) + if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported: + logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") + return Mxfp4Backend.MARLIN + + logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend") + return Mxfp4Backend.TRITON def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: @@ -854,6 +868,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): elif self.mxfp4_backend == Mxfp4Backend.MARLIN: return MarlinExperts(self.moe_quant_config) elif self.mxfp4_backend == Mxfp4Backend.TRITON: + if self.moe.is_lora_enabled: + return 
UnfusedOAITritonExperts(self.moe_quant_config) return OAITritonExperts(self.moe_quant_config) else: raise NotImplementedError( From 18523b87f67b12e9044d690dfe9da7cddc390627 Mon Sep 17 00:00:00 2001 From: Wilson Wu Date: Fri, 28 Nov 2025 10:53:55 +0800 Subject: [PATCH 028/770] [Docs] Update supported models for Olmo 3 in tool calling documentation (#29411) Signed-off-by: Wilson Wu --- docs/features/tool_calling.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index dd79ba19b7247..22dda37279ac6 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -371,7 +371,8 @@ Olmo 3 models output tool calls in a format that is very similar to the one expe Supported models: -* TODO (will be updated after Olmo 3 release) +* `allenai/Olmo-3-7B-Instruct` +* `allenai/Olmo-3-32B-Think` Flags: `--tool-call-parser olmo3` From c7ba1f6bc762af8f231e6ee885725e7401d74578 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Fri, 28 Nov 2025 13:42:30 +0800 Subject: [PATCH 029/770] [BugFix] Fix ValueError in NewRequestData repr methods (#29392) Signed-off-by: maang --- tests/v1/core/test_output.py | 36 ++++++++++++++++++++++++++++++++++++ vllm/v1/core/sched/output.py | 8 ++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 tests/v1/core/test_output.py diff --git a/tests/v1/core/test_output.py b/tests/v1/core/test_output.py new file mode 100644 index 0000000000000..9dea19320e613 --- /dev/null +++ b/tests/v1/core/test_output.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.v1.core.sched.output import NewRequestData + + +def _create_new_requests_data(prompt_embeds: torch.Tensor | None) -> NewRequestData: + return NewRequestData( + req_id="test_req", + prompt_token_ids=None, + mm_features=[], + sampling_params=None, + 
pooling_params=None, + block_ids=([],), + num_computed_tokens=0, + lora_request=None, + prompt_embeds=prompt_embeds, + ) + + +def test_repr_with_none() -> None: + """Test repr when prompt_embeds is None.""" + new_requests_data = _create_new_requests_data(None) + + assert "prompt_embeds_shape=None" in repr(new_requests_data) + assert "prompt_embeds_shape=None" in new_requests_data.anon_repr() + + +def test_repr_with_multi_element_tensor() -> None: + """Test repr when prompt_embeds is a multi-element tensor.""" + prompt_embeds = torch.randn(10, 768) + new_requests_data = _create_new_requests_data(prompt_embeds) + + assert "prompt_embeds_shape=torch.Size([10, 768])" in repr(new_requests_data) + assert "prompt_embeds_shape=torch.Size([10, 768])" in new_requests_data.anon_repr() diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index abfab43499b2a..b69fa87ebddc8 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -68,7 +68,9 @@ class NewRequestData: ) def __repr__(self) -> str: - prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None + prompt_embeds_shape = ( + self.prompt_embeds.shape if self.prompt_embeds is not None else None + ) return ( f"NewRequestData(" f"req_id={self.req_id}," @@ -88,7 +90,9 @@ class NewRequestData: prompt_token_ids_len = ( len(self.prompt_token_ids) if self.prompt_token_ids is not None else None ) - prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None + prompt_embeds_shape = ( + self.prompt_embeds.shape if self.prompt_embeds is not None else None + ) return ( f"NewRequestData(" f"req_id={self.req_id}," From 37b15e97e8443a7fd76f5aa95a78d5593f7241a4 Mon Sep 17 00:00:00 2001 From: EanWang211123 Date: Fri, 28 Nov 2025 14:05:45 +0800 Subject: [PATCH 030/770] [Multimodal][Speculative Decoding]Eagle3 mm support, enablement on qwen3vl (#29594) Signed-off-by: Tsai, Louie Signed-off-by: EanWang211123 Co-authored-by: Louie Tsai Co-authored-by: Cyrus Leung 
--- tests/models/registry.py | 4 ++++ tests/v1/e2e/test_spec_decode.py | 14 ++++++++++++++ vllm/model_executor/models/qwen3_vl.py | 23 ++++++++++++++++++++++- vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 8 ++++---- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index c9d4823d52792..1f4a106c06b4b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -913,6 +913,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "Qwen/Qwen2.5-VL-7B-Instruct", speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl", ), + "Eagle3Qwen3vlForCausalLM": _HfExamplesInfo( + "Qwen/Qwen3-VL-8B-Instruct", + speculative_model="taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + ), "Qwen3NextMTP": _HfExamplesInfo( "Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3" ), diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 03396270a31cb..3a25f7411eecd 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -283,6 +283,19 @@ def test_speculators_model_integration( ["model_setup", "mm_enabled", "enable_chunked_prefill"], [ (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), + pytest.param( + ( + "eagle3", + "Qwen/Qwen3-VL-8B-Instruct", + "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + 1, + ), + False, + False, + marks=pytest.mark.skip( + reason="architecture of its eagle3 is LlamaForCausalLMEagle3" + ), + ), pytest.param( ( "eagle3", @@ -352,6 +365,7 @@ def test_speculators_model_integration( ], ids=[ "qwen3_eagle3", + "qwen3_vl_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", "llama3_eagle3", diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4cd6fa14c32df..52d31e70a8f05 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -89,6 +89,7 @@ from vllm.utils.collection_utils import is_list_of from .interfaces 
import ( MultiModalEmbeddings, + SupportsEagle3, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, @@ -1122,9 +1123,14 @@ class Qwen3LLMModel(Qwen3Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): + if layer_idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer( positions, hidden_states, @@ -1144,6 +1150,9 @@ class Qwen3LLMModel(Qwen3Model): {"hidden_states": hidden_states, "residual": residual} ) hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states @@ -1186,7 +1195,12 @@ class Qwen3LLMForCausalLM(Qwen3ForCausalLM): dummy_inputs=Qwen3VLDummyInputsBuilder, ) class Qwen3VLForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + nn.Module, + SupportsMultiModal, + SupportsLoRA, + SupportsPP, + SupportsMRoPE, + SupportsEagle3, ): merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} @@ -1279,6 +1293,13 @@ class Qwen3VLForConditionalGeneration( self.visual_dim = config.vision_config.out_hidden_size self.multiscale_dim = self.visual_dim * self.deepstack_num_level + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.language_model.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.language_model.model.layers) + return (2, num_layers // 2, num_layers - 3) + def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: # get deepstack_input_embeds from buffer, and clear the buffer return IntermediateTensors( diff --git a/vllm/model_executor/models/registry.py 
b/vllm/model_executor/models/registry.py index ba9f33819c950..0d582043e8c02 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -414,6 +414,7 @@ _SPECULATIVE_DECODING_MODELS = { "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7600df48150ac..305abdade8da6 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1017,10 +1017,10 @@ class EagleProposer: if supports_multimodal(target_model): # handle multimodality - if ( - self.get_model_name(target_model) - == "Qwen2_5_VLForConditionalGeneration" - ): + if self.get_model_name(target_model) in [ + "Qwen2_5_VLForConditionalGeneration", + "Qwen3VLForConditionalGeneration", + ]: self.model.config.image_token_index = target_model.config.image_token_id else: self.model.config.image_token_index = ( From f4b76056ee5c3a3f917527da5be3786e1b8530c6 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 28 Nov 2025 14:05:48 +0800 Subject: [PATCH 031/770] Improve enable chunked_prefill & prefix_caching logic. 
(#26623) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: Cyrus Leung --- .../pooling/test_auto_prefix_cache_support.py | 4 +- tests/test_config.py | 240 +++++++++++++++++- vllm/config/model.py | 109 ++++++++ vllm/config/pooler.py | 6 +- vllm/config/vllm.py | 76 ++---- vllm/engine/arg_utils.py | 90 +++---- vllm/model_executor/models/bert.py | 4 +- vllm/model_executor/models/interfaces_base.py | 35 ++- vllm/model_executor/models/modernbert.py | 3 +- vllm/model_executor/models/registry.py | 15 +- vllm/v1/engine/core.py | 7 +- 11 files changed, 456 insertions(+), 133 deletions(-) diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py index 0904c7e877ef4..3795f2a5d8664 100644 --- a/tests/models/language/pooling/test_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -105,8 +105,6 @@ def test_embed_models( def test_non_causal_models( hf_runner, vllm_runner, example_prompts, model: str, dtype: str ) -> None: - with vllm_runner( - model, max_model_len=512, dtype=dtype, enable_prefix_caching=True - ) as vllm_model: + with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: cache_config = vllm_model.llm.llm_engine.cache_config assert not cache_config.enable_prefix_caching diff --git a/tests/test_config.py b/tests/test_config.py index 080e4d2afacc6..112b02edd0389 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import logging import os from dataclasses import MISSING, Field, asdict, dataclass, field from unittest.mock import patch @@ -602,6 +602,244 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) +@pytest.mark.parametrize( + ("model_id", "expected_attn_type", 
"expected_result", "reason"), + [ + # pooling models + ( + "jason9693/Qwen2.5-1.5B-apeach", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "Qwen/Qwen3-Embedding-0.6B", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "Qwen/Qwen2.5-Math-PRM-7B", + "decoder", + False, + "Pooling models with step pooling does not support chunked prefill.", + ), + ( + "internlm/internlm2-1_8b-reward", + "decoder", + False, + "Pooling models with all pooling does not support chunked prefill.", + ), + ( + "BAAI/bge-base-en", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "boltuix/NeuroBERT-NER", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "papluca/xlm-roberta-base-language-detection", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "intfloat/e5-small", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + # multimodal models + ( + "openai/clip-vit-base-patch32", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "google/siglip-base-patch16-224", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + # generate models + ( + "Qwen/Qwen3-0.6B", + "decoder", + True, + "Generative models support chunked prefill.", + ), + ( + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "hybrid", + True, + "Generative models support chunked prefill.", + ), + ( + "ibm-granite/granite-4.0-h-small", + "hybrid", + True, + "Generative models support chunked prefill.", + 
), + ( + "state-spaces/mamba-130m-hf", + "attention_free", + True, + "Generative models support chunked prefill.", + ), + # encoder_decoder models + ( + "openai/whisper-small", + "encoder_decoder", + False, + "Encoder decoder models does not support chunked prefill.", + ), + ], +) +def test_is_chunked_prefill_supported( + model_id: str, + expected_attn_type: str, + expected_result: bool, + reason: str, + caplog_vllm, +): + model_config = ModelConfig(model_id, trust_remote_code=True) + assert model_config.attn_type == expected_attn_type + with caplog_vllm.at_level(level=logging.DEBUG): + assert model_config.is_chunked_prefill_supported == expected_result + assert reason in caplog_vllm.text + + +@pytest.mark.parametrize( + ("model_id", "expected_attn_type", "expected_result", "reason"), + [ + # pooling models + ( + "jason9693/Qwen2.5-1.5B-apeach", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "Qwen/Qwen3-Embedding-0.6B", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "Qwen/Qwen2.5-Math-PRM-7B", + "decoder", + False, + "Pooling models with step pooling does not support prefix caching.", + ), + ( + "internlm/internlm2-1_8b-reward", + "decoder", + False, + "Pooling models with all pooling does not support prefix caching.", + ), + ( + "BAAI/bge-base-en", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "boltuix/NeuroBERT-NER", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "papluca/xlm-roberta-base-language-detection", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "intfloat/e5-small", + 
"encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + # multimodal models + ( + "openai/clip-vit-base-patch32", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "google/siglip-base-patch16-224", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + # generate models + ( + "Qwen/Qwen3-0.6B", + "decoder", + True, + "Generative models support prefix caching.", + ), + ( + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "hybrid", + False, + "Hybrid models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + ( + "ibm-granite/granite-4.0-h-small", + "hybrid", + False, + "Hybrid models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + ( + "state-spaces/mamba-130m-hf", + "attention_free", + False, + "Attention free models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + # encoder_decoder models + ( + "openai/whisper-small", + "encoder_decoder", + False, + "Encoder decoder models does not support prefix caching.", + ), + ], +) +def test_is_prefix_caching_supported( + model_id: str, + expected_attn_type: str, + expected_result: bool, + reason: str, + caplog_vllm, +): + model_config = ModelConfig(model_id, trust_remote_code=True) + assert model_config.attn_type == expected_attn_type + with caplog_vllm.at_level(level=logging.DEBUG): + assert model_config.is_prefix_caching_supported == expected_result + assert reason in caplog_vllm.text + + @pytest.mark.parametrize( ("backend", "custom_ops", "expected"), [ diff --git a/vllm/config/model.py b/vllm/config/model.py index 21d602b30ac1a..b9ae4fec14efa 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -107,6 +107,10 @@ _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "draft": [], } +AttnTypeStr = Literal[ + "decoder", 
"encoder", "encoder_only", "encoder_decoder", "attention_free", "hybrid" +] + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) @@ -1752,6 +1756,111 @@ class ModelConfig: logger.info("Using max model len %s", max_model_len) return max_model_len + @property + def attn_type(self) -> AttnTypeStr: + if self.pooler_config is not None: + pooling_type = self._model_info.default_pooling_type.lower() + if pooling_type == "cls": + return "encoder_only" + else: + is_causal = getattr(self.hf_config, "is_causal", True) + return "encoder_only" if not is_causal else self._model_info.attn_type + elif self.is_hybrid: + return "hybrid" + elif self.is_attention_free: + return "attention_free" + elif self.is_encoder_decoder: + return "encoder_decoder" + else: + return "decoder" + + @property + def is_chunked_prefill_supported(self) -> bool: + attn_type = self.attn_type + if self.pooler_config is not None: + # for pooling models + if attn_type == "encoder_only": + logger.debug( + "Pooling models with bidirectional attn does not support " + "chunked prefill." + ) + return False + elif attn_type == "decoder": + pooling_type = self.pooler_config.pooling_type.lower() + if pooling_type in ["all", "mean", "step", "cls"]: + logger.debug( + "Pooling models with %s pooling does not " + "support chunked prefill.", + pooling_type, + ) + return False + else: + # pooling_type == "last" + logger.debug( + "Pooling models with causal attn and last pooling support " + "chunked prefill." + ) + return True + # vllm currently does not have pooling models using hybrid, + # attention_free or encoder_decoder attn types. 
+ return attn_type != "encoder_decoder" + else: + if attn_type == "encoder_decoder": + logger.debug("Encoder decoder models does not support chunked prefill.") + return False + logger.debug("Generative models support chunked prefill.") + return True + + @property + def is_prefix_caching_supported(self) -> bool: + attn_type = self.attn_type + if self.pooler_config is not None: + # for pooling models + if attn_type == "encoder_only": + logger.debug( + "Pooling models with bidirectional attn does not " + "support prefix caching." + ) + return False + elif attn_type == "decoder": + pooling_type = self.pooler_config.pooling_type.lower() + if pooling_type in ["all", "mean", "step", "cls"]: + logger.debug( + "Pooling models with %s pooling does not " + "support prefix caching.", + pooling_type, + ) + return False + else: + # pooling_type == "last" + logger.debug( + "Pooling models with causal attn and last pooling support " + "prefix caching." + ) + return True + # vllm currently does not have pooling models using hybrid, + # attention_free or encoder_decoder attn types. + return False + else: + if attn_type == "hybrid": + logger.debug( + "Hybrid models does not support prefix caching since the feature " + "is still experimental." + ) + return False + elif attn_type == "attention_free": + logger.debug( + "Attention free models does not support prefix caching since the " + "feature is still experimental." 
+ ) + return False + elif attn_type == "encoder_decoder": + logger.debug("Encoder decoder models does not support prefix caching.") + return False + else: # attn_type == "decoder" + logger.debug("Generative models support prefix caching.") + return True + def is_model_moe( self, ) -> bool: diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index 85950bbcd666f..aa4e7006d0247 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any +from typing import Any, Literal from pydantic.dataclasses import dataclass @@ -11,13 +11,15 @@ from vllm.utils.hashing import safe_hash logger = init_logger(__name__) +PoolingTypeStr = Literal["LAST", "ALL", "CLS", "STEP", "MEAN"] + @config @dataclass class PoolerConfig: """Controls the behavior of output pooling in pooling models.""" - pooling_type: str | None = None + pooling_type: PoolingTypeStr | None = None """ The pooling method of the pooling model. This should be a key in [`vllm.model_executor.layers.pooler.PoolingType`][]. diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index c576275e80fe3..7ac8cc764322e 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -721,65 +721,27 @@ class VllmConfig: "correctness and to realize prefill savings. " ) - disable_chunked_prefill_reasons: list[str] = [] + if self.model_config and self.model_config.is_encoder_decoder: + from vllm.multimodal import MULTIMODAL_REGISTRY - if self.model_config: - if self.model_config.pooler_config: - pooling_type = self.model_config.pooler_config.pooling_type - if pooling_type is None or pooling_type.lower() != "last": - disable_chunked_prefill_reasons.append( - 'Only "last" pooling supports chunked ' - "prefill and prefix caching; disabling both." 
- ) - if not getattr(self.model_config.hf_config, "is_causal", True): - disable_chunked_prefill_reasons.append( - "Only models using causal attention support chunked " - "prefill and prefix caching; disabling both." - ) - elif self.model_config.is_encoder_decoder: - from vllm.multimodal import MULTIMODAL_REGISTRY - - self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) - ) - logger.debug( - "Encoder-decoder model detected: setting " - "`max_num_encoder_input_tokens` to encoder length (%s)", - self.scheduler_config.max_num_encoder_input_tokens, - ) - if ( - self.model_config.architecture == "WhisperForConditionalGeneration" - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" - ): - logger.warning( - "Whisper is known to have issues with " - "forked workers. If startup is hanging, " - "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " - "to 'spawn'." - ) - - # Final off-switch for CP/APC: - # Disable for (a) collected blockers, (b) encoder–decoder, or - # (c) explicit CP=False when APC wasn't requested. - # Do NOT disable merely because the resolved CP flag is False. 
- apc_requested = ( - self.cache_config is not None and self.cache_config.enable_prefix_caching - ) - if ( - disable_chunked_prefill_reasons - or (self.model_config is not None and self.model_config.is_encoder_decoder) - or ( - self.scheduler_config.enable_chunked_prefill is False - and not apc_requested + self.scheduler_config.max_num_encoder_input_tokens = ( + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) ) - ): - for reason in disable_chunked_prefill_reasons: - logger.info(reason) - self.scheduler_config.enable_chunked_prefill = False - self.scheduler_config.long_prefill_token_threshold = 0 - - if self.cache_config is not None: - self.cache_config.enable_prefix_caching = False + logger.debug( + "Encoder-decoder model detected: setting " + "`max_num_encoder_input_tokens` to encoder length (%s)", + self.scheduler_config.max_num_encoder_input_tokens, + ) + if ( + self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + logger.warning( + "Whisper is known to have issues with " + "forked workers. If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'." + ) if ( self.kv_events_config is not None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4c9a82d25223..ad5a34c56161c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1349,30 +1349,10 @@ class EngineArgs: self.tokenizer = model_config.tokenizer self._check_feature_supported(model_config) - - # Set default arguments for V1 Engine. 
- self._set_default_args(usage_context, model_config) - # Disable chunked prefill and prefix caching for: - # POWER (ppc64le)/s390x/RISCV CPUs in V1 - if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( - CpuArchEnum.POWERPC, - CpuArchEnum.S390X, - CpuArchEnum.RISCV, - ): - logger.info( - "Chunked prefill is not supported for ARM and POWER, " - "S390X and RISC-V CPUs; " - "disabling it for V1 backend." - ) - self.enable_chunked_prefill = False - logger.info( - "Prefix caching is not supported for ARM and POWER, " - "S390X and RISC-V CPUs; " - "disabling it for V1 backend." - ) - self.enable_prefix_caching = False - - assert self.enable_chunked_prefill is not None + self._set_default_chunked_prefill_and_prefix_caching_args(model_config) + self._set_default_max_num_seqs_and_batched_tokens_args( + usage_context, model_config + ) sliding_window: int | None = None if not is_interleaved(model_config.hf_text_config): @@ -1805,34 +1785,6 @@ class EngineArgs: ) _raise_unsupported_error(feature_name=name) - @classmethod - def get_chunked_prefill_prefix_caching_defaults( - cls, - model_config: ModelConfig, - ) -> tuple[bool, bool]: - if model_config.runner_type != "pooling": - default_chunked_prefill = True - - # Disable prefix caching default for hybrid models and mamba-only - # models since the feature is still experimental. 
- default_prefix_caching = not ( - model_config.is_hybrid or model_config.is_attention_free - ) - else: - assert model_config.pooler_config is not None - - pooling_type = model_config.pooler_config.pooling_type - incremental_prefill_supported = ( - pooling_type is not None - and pooling_type.lower() == "last" - and getattr(model_config.hf_config, "is_causal", True) - ) - - default_chunked_prefill = incremental_prefill_supported - default_prefix_caching = incremental_prefill_supported - - return default_chunked_prefill, default_prefix_caching - @classmethod def get_batch_defaults( cls, @@ -1916,14 +1868,11 @@ class EngineArgs: return default_max_num_batched_tokens, default_max_num_seqs - def _set_default_args( - self, usage_context: UsageContext, model_config: ModelConfig + def _set_default_chunked_prefill_and_prefix_caching_args( + self, model_config: ModelConfig ) -> None: - """Set Default Arguments for V1 Engine.""" - ( - default_chunked_prefill, - default_prefix_caching, - ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + default_chunked_prefill = model_config.is_chunked_prefill_supported + default_prefix_caching = model_config.is_prefix_caching_supported if self.prefill_context_parallel_size > 1: default_chunked_prefill = False @@ -1984,6 +1933,29 @@ class EngineArgs: scope="local", ) + # Disable chunked prefill and prefix caching for: + # POWER (ppc64le)/s390x/RISCV CPUs in V1 + if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( + CpuArchEnum.POWERPC, + CpuArchEnum.S390X, + CpuArchEnum.RISCV, + ): + logger.info( + "Chunked prefill is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." + ) + self.enable_chunked_prefill = False + logger.info( + "Prefix caching is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." 
+ ) + self.enable_prefix_caching = False + + def _set_default_max_num_seqs_and_batched_tokens_args( + self, usage_context: UsageContext, model_config: ModelConfig + ): world_size = self.pipeline_parallel_size * self.tensor_parallel_size ( default_max_num_batched_tokens, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 2679448bce775..e774cd647ea8c 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -32,7 +32,7 @@ from vllm.tasks import PoolingTask from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding, SupportsQuant -from .interfaces_base import default_pooling_type +from .interfaces_base import attn_type, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -432,7 +432,6 @@ class BertModel(nn.Module, SupportsQuant): return loaded_params -@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -864,6 +863,7 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu ) +@attn_type("encoder_only") @default_pooling_type("ALL") class BertForTokenClassification(nn.Module): is_pooling_model = True diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 85c5574bacf0a..2c99fce8d918c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -19,10 +19,14 @@ from vllm.utils.func_utils import supports_kw if TYPE_CHECKING: from vllm.config import VllmConfig + from vllm.config.model import AttnTypeStr + from vllm.config.pooler import PoolingTypeStr from vllm.model_executor.layers.pooler import Pooler else: VllmConfig = Any Pooler = Any + PoolingTypeStr = Any + AttnTypeStr = Any logger = init_logger(__name__) @@ -165,7 +169,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. 
""" - default_pooling_type: ClassVar[str] = "LAST" + default_pooling_type: ClassVar[PoolingTypeStr] = "LAST" """ Indicates the [vllm.config.pooler.PoolerConfig.pooling_type][] to use by default. @@ -175,6 +179,17 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): decorator to conveniently set this field. """ + attn_type: ClassVar[AttnTypeStr] = "decoder" + """ + Indicates the + [vllm.config.model.ModelConfig.attn_type][] + to use by default. + + You can use the + [vllm.model_executor.models.interfaces_base.attn_type][] + decorator to conveniently set this field. + """ + pooler: Pooler """The pooler is only called on TP rank 0.""" @@ -199,7 +214,7 @@ def is_pooling_model( _T = TypeVar("_T", bound=type[nn.Module]) -def default_pooling_type(pooling_type: str): +def default_pooling_type(pooling_type: PoolingTypeStr): """Decorator to set `VllmModelForPooling.default_pooling_type`.""" def func(model: _T) -> _T: @@ -209,5 +224,19 @@ def default_pooling_type(pooling_type: str): return func -def get_default_pooling_type(model: type[object] | object) -> str: +def get_default_pooling_type(model: type[object] | object) -> PoolingTypeStr: return getattr(model, "default_pooling_type", "LAST") + + +def attn_type(attn_type: AttnTypeStr): + """Decorator to set `VllmModelForPooling.attn_type`.""" + + def func(model: _T) -> _T: + model.attn_type = attn_type # type: ignore + return model + + return func + + +def get_attn_type(model: type[object] | object) -> AttnTypeStr: + return getattr(model, "attn_type", "decoder") diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 3a8a6c74d9d15..743bc23d9876f 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -28,7 +28,7 @@ from vllm.tasks import PoolingTask from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding -from .interfaces_base import default_pooling_type +from .interfaces_base import 
attn_type, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -396,6 +396,7 @@ class ModernBertPredictionHead(nn.Module): return self.norm(self.act(self.dense(hidden_states))) +@attn_type("encoder_only") @default_pooling_type("ALL") class ModernBertForTokenClassification(nn.Module): is_pooling_model = True diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0d582043e8c02..73a61f1148b50 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -17,7 +17,7 @@ from collections.abc import Callable, Set from dataclasses import asdict, dataclass, field from functools import lru_cache from pathlib import Path -from typing import TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import torch.nn as nn import transformers @@ -33,6 +33,14 @@ from vllm.logging_utils import logtime from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module from vllm.utils.hashing import safe_hash +if TYPE_CHECKING: + from vllm.config.model import AttnTypeStr + from vllm.config.pooler import PoolingTypeStr +else: + AttnTypeStr = Any + PoolingTypeStr = Any + + from .interfaces import ( has_inner_state, has_noops, @@ -47,6 +55,7 @@ from .interfaces import ( supports_transcription, ) from .interfaces_base import ( + get_attn_type, get_default_pooling_type, is_pooling_model, is_text_generation_model, @@ -509,7 +518,8 @@ class _ModelInfo: architecture: str is_text_generation_model: bool is_pooling_model: bool - default_pooling_type: str + attn_type: AttnTypeStr + default_pooling_type: PoolingTypeStr supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input_only: bool @@ -530,6 +540,7 @@ class _ModelInfo: is_text_generation_model=is_text_generation_model(model), is_pooling_model=is_pooling_model(model), default_pooling_type=get_default_pooling_type(model), + attn_type=get_attn_type(model), 
supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input_only=supports_multimodal_raw_input_only( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8657a95b5e6e7..e3a5f51a8fc56 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -119,11 +119,12 @@ class EngineCore: # Setup scheduler. Scheduler = vllm_config.scheduler_config.get_scheduler_cls() - if len(kv_cache_config.kv_cache_groups) == 0: + if len(kv_cache_config.kv_cache_groups) == 0: # noqa: SIM102 # Encoder models without KV cache don't support # chunked prefill. But do SSM models? - logger.info("Disabling chunked prefill for model without KVCache") - vllm_config.scheduler_config.enable_chunked_prefill = False + if vllm_config.scheduler_config.enable_chunked_prefill: + logger.warning("Disabling chunked prefill for model without KVCache") + vllm_config.scheduler_config.enable_chunked_prefill = False scheduler_block_size = ( vllm_config.cache_config.block_size From b34e8775a31c1a077a1a24f22ffbf048b2a979f6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 14:43:18 +0800 Subject: [PATCH 032/770] Revert "[CPU]Update CPU PyTorch to 2.9.0 (#29589)" (#29647) Signed-off-by: DarkLight1337 --- docker/Dockerfile.cpu | 4 ++++ requirements/cpu-build.txt | 4 ++-- requirements/cpu.txt | 8 ++++---- vllm/model_executor/models/qwen3_vl.py | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 67d3fb83a0275..eb3807ef0ca4e 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -119,6 +119,7 @@ FROM base AS vllm-test-deps WORKDIR /workspace/vllm +# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ @@ 
-131,6 +132,9 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ esac; \ }; \ remove_packages_not_supported_on_aarch64 && \ + sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ + sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ + sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 0c6fdd3b33cd1..81d429a5e5f8d 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,9 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 8c04d6d5ce1b0..e23d3286f3f78 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == 
"aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.9.0; platform_machine == "ppc64le" +torchaudio==2.8.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.24.0; platform_machine == "ppc64le" +torchvision==0.23.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 52d31e70a8f05..39fe8336b84a1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1123,14 +1123,14 @@ class Qwen3LLMModel(Qwen3Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): if layer_idx in self.aux_hidden_state_layers: aux_hidden_states.append(hidden_states + residual) - + hidden_states, residual = layer( positions, hidden_states, From 480598958e28fa1e2ed2f7be2d457fc6f85a1748 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Fri, 28 Nov 2025 15:53:20 +0800 Subject: [PATCH 033/770] [Feature][Bench] Add pareto visualization (#29477) Signed-off-by: rongfu.leng --- docs/contributing/benchmarks.md | 18 ++ docs/mkdocs/hooks/generate_argparse.py | 4 + vllm/benchmarks/sweep/cli.py | 3 + vllm/benchmarks/sweep/plot_pareto.py | 393 +++++++++++++++++++++++++ 4 files changed, 418 insertions(+) create mode 100644 vllm/benchmarks/sweep/plot_pareto.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index c9bc9cfe28a35..e4714e6266381 100644 --- a/docs/contributing/benchmarks.md +++ 
b/docs/contributing/benchmarks.md @@ -1146,6 +1146,24 @@ vllm bench sweep plot benchmarks/results/ \ !!! tip You can use `--dry-run` to preview the figures to be plotted. +### Pareto visualization (tokens/s/user vs tokens/s/GPU) + +`vllm bench sweep plot_pareto` helps pick configurations that balance per-user and per-GPU throughput. + +Higher concurrency or batch size can raise GPU efficiency (per-GPU), but can add per user latency; lower concurrency improves per-user rate but underutilizes GPUs; The Pareto frontier shows the best achievable pairs across your runs. + +- x-axis: tokens/s/user = `output_throughput` ÷ concurrency (`--user-count-var`, default `max_concurrency`, fallback `max_concurrent_requests`). +- y-axis: tokens/s/GPU = `output_throughput` ÷ GPU count (`--gpu-count-var` if set; else gpu_count is TP×PP*DP). +- Output: a single figure at `OUTPUT_DIR/pareto/PARETO.png`. +- Show the configuration used in each data point `--label-by` (default: `max_concurrency,gpu_count`). + +Example: + +```bash +vllm bench sweep plot_pareto benchmarks/results/ \ + --label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size +``` + ## Performance Benchmarks The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. 
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 735074c08b8c8..4ae64a6e4bfcc 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -94,6 +94,9 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100): bench_latency = auto_mock("vllm.benchmarks", "latency") bench_serve = auto_mock("vllm.benchmarks", "serve") bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") +bench_sweep_plot_pareto = auto_mock( + "vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs" +) bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs") bench_sweep_serve_sla = auto_mock( "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs" @@ -221,6 +224,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): "bench_latency": create_parser(bench_latency.add_cli_args), "bench_serve": create_parser(bench_serve.add_cli_args), "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), + "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args), "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args), "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args), "bench_throughput": create_parser(bench_throughput.add_cli_args), diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py index 108cd75690864..e74e0e2c181c5 100644 --- a/vllm/benchmarks/sweep/cli.py +++ b/vllm/benchmarks/sweep/cli.py @@ -6,6 +6,8 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG from .plot import SweepPlotArgs from .plot import main as plot_main +from .plot_pareto import SweepPlotParetoArgs +from .plot_pareto import main as plot_pareto_main from .serve import SweepServeArgs from .serve import main as serve_main from .serve_sla import SweepServeSLAArgs @@ -15,6 +17,7 @@ SUBCOMMANDS = ( (SweepServeArgs, serve_main), (SweepServeSLAArgs, serve_sla_main), (SweepPlotArgs, 
plot_main), + (SweepPlotParetoArgs, plot_pareto_main), ) diff --git a/vllm/benchmarks/sweep/plot_pareto.py b/vllm/benchmarks/sweep/plot_pareto.py new file mode 100644 index 0000000000000..70472552b5cd4 --- /dev/null +++ b/vllm/benchmarks/sweep/plot_pareto.py @@ -0,0 +1,393 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import math +from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass +from functools import partial +from pathlib import Path +from typing import ClassVar + +from vllm.utils.collection_utils import full_groupby +from vllm.utils.import_utils import PlaceholderModule + +from .plot import DummyExecutor, _json_load_bytes +from .utils import sanitize_filename + +try: + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns +except ImportError: + plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot") + pd = PlaceholderModule("pandas") + sns = PlaceholderModule("seaborn") + + +def _first_present(run_data: dict[str, object], keys: list[str]): + for key in keys: + for candidate in {key, key.replace("_", "-"), key.replace("-", "_")}: + if candidate in run_data: + return run_data[candidate] + return None + + +def _get_numeric( + run_data: dict[str, object], + keys: list[str], + *, + allow_zero: bool = True, +) -> float | None: + value = _first_present(run_data, keys) + if value is None: + return None + + try: + numeric = float(value) + except (TypeError, ValueError) as exc: + raise ValueError( + f"Expected numeric value for one of {keys}, " + f"but found {value!r} in {run_data=}" + ) from exc + + if not allow_zero and numeric == 0: + return None + + return numeric + + +def _infer_user_count( + run_data: dict[str, object], + user_count_var: str | None, +) -> float | None: + candidates = [user_count_var] if user_count_var else [] + candidates.extend(["request_rate"]) + user_count = _get_numeric(run_data, candidates, 
allow_zero=False) + if user_count is not None: + return user_count + + # Fallback to the observed peak if configured value is missing. + return _get_numeric(run_data, ["max_concurrent_requests"], allow_zero=False) + + +def _infer_gpu_count( + run_data: dict[str, object], + gpu_count_var: str | None, +) -> float: + direct_candidates = [gpu_count_var] if gpu_count_var else [] + direct_gpu_count = _get_numeric(run_data, direct_candidates, allow_zero=False) + if direct_gpu_count: + return direct_gpu_count + + tp_size = _get_numeric(run_data, ["tensor_parallel_size", "tp"]) + pp_size = _get_numeric(run_data, ["pipeline_parallel_size", "pp"]) + dp_size = _get_numeric(run_data, ["data_parallel_size", "dp"]) + world_size = 1.0 + if tp_size: + world_size *= tp_size + if pp_size: + world_size *= pp_size + if dp_size: + world_size *= dp_size + + return world_size + + +def _get_throughput( + run_data: dict[str, object], + throughput_var: str, +) -> float: + throughput = _get_numeric(run_data, [throughput_var]) + if throughput is None: + raise ValueError( + f"Cannot find throughput metric {throughput_var!r} in run data. 
" + f"Available keys: {sorted(run_data)}" + ) + + return throughput + + +def _prepare_records( + all_data: list[dict[str, object]], + *, + user_count_var: str | None, + gpu_count_var: str | None, +) -> tuple[list[dict[str, object]], int]: + prepared = [] + skipped_missing_users = 0 + + for record in all_data: + throughput = _get_throughput(record, "output_throughput") + user_count = _infer_user_count(record, user_count_var) + if user_count is None: + skipped_missing_users += 1 + continue + + gpu_count = _infer_gpu_count(record, gpu_count_var) + tokens_per_user = throughput / user_count + tokens_per_gpu = throughput / gpu_count + + prepared.append( + { + **record, + "tokens_per_user": tokens_per_user, + "tokens_per_gpu": tokens_per_gpu, + "user_count_estimate": user_count, + "gpu_count": gpu_count, + } + ) + + return prepared, skipped_missing_users + + +def _pareto_frontier( + df: "pd.DataFrame", + x_col: str, + y_col: str, + *, + epsilon: float = 1e-9, +) -> "pd.DataFrame": + sorted_df = df.sort_values([x_col, y_col], ascending=[False, False]) + frontier_indices = [] + best_y = -math.inf + + for idx, row in sorted_df.iterrows(): + y_val = row[y_col] + if y_val >= best_y - epsilon: + frontier_indices.append(idx) + best_y = max(best_y, y_val) + + return df.loc[frontier_indices] + + +def _get_fig_path( + fig_dir: Path, + fig_group: tuple[tuple[str, str], ...], +) -> Path: + parts = ["PARETO"] + if fig_group: + parts.extend(f"{k}={v}" for k, v in fig_group) + filename = sanitize_filename("-".join(parts) + ".png") + return fig_dir / filename + + +def _plot_fig( + fig_dir: Path, + fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]], + label_by: list[str], + *, + dry_run: bool, +): + fig_group, fig_data = fig_group_data + fig_path = _get_fig_path(fig_dir, fig_group) + + print("[BEGIN FIGURE]") + print(f"Group: {dict(fig_group)}") + print(f"Output file: {fig_path}") + + if dry_run: + print("[END FIGURE]") + return + + df = 
pd.DataFrame.from_records(fig_data) + df = df.dropna(subset=["tokens_per_user", "tokens_per_gpu"]) + + if df.empty: + print("No data points available after filtering; skipping.") + print("[END FIGURE]") + return + + frontier = _pareto_frontier(df, "tokens_per_user", "tokens_per_gpu") + frontier = frontier.sort_values("tokens_per_user") + + fig, ax = plt.subplots() + sns.scatterplot( + data=df, + x="tokens_per_user", + y="tokens_per_gpu", + color="0.5", + alpha=0.6, + ax=ax, + label="All runs", + ) + sns.lineplot( + data=frontier, + x="tokens_per_user", + y="tokens_per_gpu", + marker="o", + ax=ax, + label="Pareto frontier", + ) + + if label_by: + for _, row in frontier.iterrows(): + label_parts = [] + for key in label_by: + if key in row: + label_parts.append(f"{key}={row[key]}") + if label_parts: + ax.text( + row["tokens_per_user"], + row["tokens_per_gpu"], + "\n".join(label_parts), + fontsize=8, + ) + + ax.set_xlabel("Tokens/s/user") + ax.set_ylabel("Tokens/s/GPU") + ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.6) + fig.tight_layout() + fig.savefig(fig_path) + plt.close(fig) + + print( + f"Plotted {len(df)} points; Pareto frontier size: {len(frontier)}.", + ) + print("[END FIGURE]") + + +def plot_pareto( + output_dir: Path, + user_count_var: str | None, + gpu_count_var: str | None, + label_by: list[str], + *, + dry_run: bool, +): + fig_dir = output_dir / "pareto" + raw_data = [ + run_data + for path in output_dir.rglob("**/summary.json") + for run_data in _json_load_bytes(path) + ] + + if not raw_data: + raise ValueError(f"Did not find any parameter sweep results under {output_dir}") + + fig_dir.mkdir(parents=True, exist_ok=True) + + prepared_data, skipped_missing_users = _prepare_records( + raw_data, + user_count_var=user_count_var, + gpu_count_var=gpu_count_var, + ) + + if skipped_missing_users: + print( + f"Skipped {skipped_missing_users} runs without a user count " + "(`max_concurrency` or `max_concurrent_requests`).", + ) + + if not prepared_data: + 
raise ValueError( + "No data points with both throughput and user count available " + "to plot Pareto frontier.", + ) + + fig_groups = full_groupby( + prepared_data, + key=lambda item: tuple(), + ) + + with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor: + all( + executor.map( + partial( + _plot_fig, + fig_dir, + label_by=label_by, + dry_run=dry_run, + ), + fig_groups, + ) + ) + + +@dataclass +class SweepPlotParetoArgs: + output_dir: Path + user_count_var: str | None + gpu_count_var: str | None + label_by: list[str] + dry_run: bool + + parser_name: ClassVar[str] = "plot_pareto" + parser_help: ClassVar[str] = ( + "Plot Pareto frontier between tokens/s/user and tokens/s/GPU " + "from parameter sweep results." + ) + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + output_dir = Path(args.OUTPUT_DIR) + if not output_dir.exists(): + raise ValueError(f"No parameter sweep results under {output_dir}") + + label_by = [] if not args.label_by else args.label_by.split(",") + + return cls( + output_dir=output_dir, + user_count_var=args.user_count_var, + gpu_count_var=args.gpu_count_var, + label_by=label_by, + dry_run=args.dry_run, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser): + parser.add_argument( + "OUTPUT_DIR", + type=str, + default="results", + help="The directory containing the sweep results to plot.", + ) + parser.add_argument( + "--user-count-var", + type=str, + default="max_concurrency", + help="Result key that stores concurrent user count. " + "Falls back to max_concurrent_requests if missing.", + ) + parser.add_argument( + "--gpu-count-var", + type=str, + default=None, + help="Result key that stores GPU count. 
" + "If not provided, falls back to num_gpus/gpu_count " + "or tensor_parallel_size * pipeline_parallel_size.", + ) + parser.add_argument( + "--label-by", + type=str, + default="max_concurrency,gpu_count", + help="Comma-separated list of fields to annotate on Pareto frontier " + "points.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the figures to plot without drawing them.", + ) + + return parser + + +def run_main(args: SweepPlotParetoArgs): + return plot_pareto( + output_dir=args.output_dir, + user_count_var=args.user_count_var, + gpu_count_var=args.gpu_count_var, + label_by=args.label_by, + dry_run=args.dry_run, + ) + + +def main(args: argparse.Namespace): + run_main(SweepPlotParetoArgs.from_cli_args(args)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=SweepPlotParetoArgs.parser_help) + SweepPlotParetoArgs.add_cli_args(parser) + + main(parser.parse_args()) From cc0f2a0e19881c3c601d3e287f297b36d2a78f78 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:12:20 +0800 Subject: [PATCH 034/770] [Doc] Improve abnormal information string (#29655) Signed-off-by: maang --- vllm/v1/engine/utils.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index d65cad7af03d6..24bf66c42f312 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -371,8 +371,7 @@ class CoreEngineActorManager: ) assert len(nodes) > 0, "No nodes with resources found in Ray cluster." 
assert dp_master_ip_key in nodes[0], ( - "The DP master node (ip: %s) is missing or dead", - dp_master_ip, + f"The DP master node (ip: {dp_master_ip}) is missing or dead" ) device_str = current_platform.ray_device_key n_node_devices: list[int] = [ @@ -446,8 +445,7 @@ class CoreEngineActorManager: if key != "node:__internal_head__" and key.startswith("node:") ] assert len(node_ip_keys) == 1, ( - "Zero or multiple node IP keys found in node resources: %s", - node_ip_keys, + f"Zero or multiple node IP keys found in node resources: {node_ip_keys}" ) node_ip_key = node_ip_keys[0] node_ip = node_ip_key.split(":")[1] @@ -464,11 +462,9 @@ class CoreEngineActorManager: if node_ip == dp_master_ip: if dp_size_available < dp_size_local: raise ValueError( - "Not enough resources to allocate %s DP ranks " - "on DP master node %s, possible to fit %s DP ranks", - dp_size_local, - dp_master_ip, - dp_size_available, + f"Not enough resources to allocate {dp_size_local} DP ranks " + f"on DP master node {dp_master_ip}, possible to fit " + f"{dp_size_available} DP ranks." 
) dp_size_to_allocate = dp_size_local elif pack_strategy == "strict": From b2c1d294faca96643dbc2413d604ca160f458f0d Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 28 Nov 2025 09:44:47 +0100 Subject: [PATCH 035/770] [BUGFIX] MistralTokenizer._call__ adds an invalid EOS token (#29607) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- tests/tokenization/test_mistral_tokenizer.py | 68 +++++++++++++++++++ vllm/transformers_utils/tokenizers/mistral.py | 20 +++++- 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index 1ada8ee187c38..c80b698ba3848 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -331,6 +331,7 @@ class TestMistralTokenizer: ) == token_ids ) + assert mistral_tokenizer.encode_one("") == [] def test_encode(self, mistral_tokenizer: MistralTokenizer): token_ids = ( @@ -370,6 +371,51 @@ class TestMistralTokenizer: mistral_tokenizer.encode("Hello world !", add_special_tokens=False) == token_ids[1:] ) + assert mistral_tokenizer.encode("", add_special_tokens=False) == [] + + def test_call(self, mistral_tokenizer: MistralTokenizer): + token_ids = ( + [1, 22177, 4304, 2662] + if mistral_tokenizer.is_tekken + else [1, 23325, 2294, 1686] + ) + attn_mask = [1 for _ in range(len(token_ids))] + + # Test 1: default + assert mistral_tokenizer("Hello world !") == { + "attention_mask": attn_mask[1:], + "input_ids": token_ids[1:], + } + # Test 2: special tokens + assert mistral_tokenizer("Hello world !", add_special_tokens=True) == { + "attention_mask": attn_mask, + "input_ids": token_ids, + } + # Test 3: special tokens + truncation + assert mistral_tokenizer( + "Hello 
world !", add_special_tokens=True, truncation=True, max_length=3 + ) == { + "attention_mask": attn_mask[:-1], + "input_ids": token_ids[:-1], + } + # Test 4: special tokens + no truncation + max length + assert mistral_tokenizer( + "Hello world !", add_special_tokens=True, max_length=3 + ) == { + "attention_mask": attn_mask, + "input_ids": token_ids, + } + # Test 5: empty string + assert mistral_tokenizer("") == { + "attention_mask": [], + "input_ids": [], + } + + with pytest.raises( + ValueError, + match=(r"`text_pair` is not supported by `MistralTokenizer.__call__`."), + ): + mistral_tokenizer("Hello world !", "invalid pair") @pytest.mark.parametrize( "openai_request,add_generation_prompt,continue_final_message,expected_output,decoded_expected_output", @@ -1087,6 +1133,24 @@ class TestMistralTokenizer: ) == expected_tokens[mistral_tokenizer.is_tekken] ) + assert ( + mistral_tokenizer.decode( + ids[mistral_tokenizer.is_tekken], + skip_special_tokens=skip_special_tokens, + ) + == expected_tokens[mistral_tokenizer.is_tekken] + ) + + def test_decode_empty( + self, + mistral_tokenizer: MistralTokenizer, + ): + assert ( + mistral_tokenizer.decode( + [], + ) + == "" + ) def test_decode_int( self, @@ -1390,6 +1454,8 @@ class TestMistralTokenizer: == expected_strings[mistral_tokenizer.is_tekken] ) + assert mistral_tokenizer.convert_tokens_to_string([]) == "" + @pytest.mark.parametrize( "skip_special_tokens,tuple_expected_tokens", ( @@ -2220,3 +2286,5 @@ class TestMistralTokenizer: ids, skip_special_tokens=skip_special_tokens ) assert actual_tokens == expected_tokens + + assert mistral_tokenizer.convert_ids_to_tokens([]) == [] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 39198a1f3d815..caff43c55ce85 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -312,13 +312,27 @@ class MistralTokenizer(TokenizerBase): truncation: bool = False, max_length: 
int | None = None, ): - return self.transformers_tokenizer( + if text_pair is not None: + raise ValueError( + "`text_pair` is not supported by `MistralTokenizer.__call__`." + ) + + encoded = self.transformers_tokenizer( text=text, text_pair=text_pair, add_special_tokens=add_special_tokens, truncation=truncation, max_length=max_length, ) + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, revert to only call self.transformers_tokenizer(...). + # Hack to fix wrongly added eos token, when fix will be supported the condition + # below will be False even before the revert is done. + if encoded["input_ids"] and encoded["input_ids"][-1] == self.eos_token_id: + encoded["input_ids"].pop(-1) + if attention_mask := encoded.get("attention_mask"): + attention_mask.pop(-1) + return encoded @property def vocab(self) -> list[str]: @@ -349,6 +363,8 @@ class MistralTokenizer(TokenizerBase): max_length: int | None = None, add_special_tokens: bool | None = None, ) -> list[int]: + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, directly call self.transformers_tokenizer.encode(...). encoded = self.tokenizer.encode( text, bos=add_special_tokens is not False, eos=False ) @@ -387,6 +403,8 @@ class MistralTokenizer(TokenizerBase): ) def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, directly call self.transformers_tokenizer.decode(...). 
if isinstance(ids, int): ids = [ids] From 5f5521bd5d7d38d380640166294d97a839cf7ef9 Mon Sep 17 00:00:00 2001 From: Filipp Fisin <48059208+qGentry@users.noreply.github.com> Date: Fri, 28 Nov 2025 09:45:10 +0100 Subject: [PATCH 036/770] Fix parameter order in GPT-OSS weight loading function for non-MXFP4 weights (#29506) Signed-off-by: Filipp Fisin <48059208+qGentry@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 9de3e261941b1..cff16b7a7a8cd 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -647,8 +647,8 @@ class GptOssModel(nn.Module): ) else: return self._load_weights_other( - ep_rank_start, ep_rank_end, + ep_rank_start, heads_per_rank, head_start, weights, From ccbdf51bd57761a7a7e7a5adf685fcec67c9c1bd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 17:19:25 +0800 Subject: [PATCH 037/770] [Doc] Reorganize benchmark docs (#29658) Signed-off-by: DarkLight1337 --- docs/.nav.yml | 5 + docs/benchmarking/README.md | 7 + .../benchmarks.md => benchmarking/cli.md} | 335 +++--------------- docs/benchmarking/dashboard.md | 58 +++ docs/benchmarking/sweeps.md | 178 ++++++++++ 5 files changed, 291 insertions(+), 292 deletions(-) create mode 100644 docs/benchmarking/README.md rename docs/{contributing/benchmarks.md => benchmarking/cli.md} (71%) create mode 100644 docs/benchmarking/dashboard.md create mode 100644 docs/benchmarking/sweeps.md diff --git a/docs/.nav.yml b/docs/.nav.yml index c8bf00efb2370..d30c0f12eba4c 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -52,6 +52,11 @@ nav: - Plugins: - design/*plugin*.md - design/* + - Benchmarking: + - benchmarking/README.md + - benchmarking/cli.md + - benchmarking/sweeps.md + - benchmarking/dashboard.md - API Reference: - api/README.md - api/vllm diff --git 
a/docs/benchmarking/README.md b/docs/benchmarking/README.md new file mode 100644 index 0000000000000..238290d4762b3 --- /dev/null +++ b/docs/benchmarking/README.md @@ -0,0 +1,7 @@ +# Benchmark Suites + +vLLM provides comprehensive benchmarking tools for performance testing and evaluation: + +- **[Benchmark CLI](./cli.md)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing. +- **[Parameter Sweeps](./sweeps.md)**: Automate `vllm bench` runs for multiple configurations, useful for [optimization and tuning](../configuration/optimization.md). +- **[Performance Dashboard](./dashboard.md)**: Automated CI that publishes benchmarks on each commit. diff --git a/docs/contributing/benchmarks.md b/docs/benchmarking/cli.md similarity index 71% rename from docs/contributing/benchmarks.md rename to docs/benchmarking/cli.md index e4714e6266381..44a4c40125952 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/benchmarking/cli.md @@ -1,22 +1,10 @@ ---- -toc_depth: 4 ---- +# Benchmark CLI -# Benchmark Suites +This section guides you through running benchmark tests with the extensive datasets supported on vLLM. -vLLM provides comprehensive benchmarking tools for performance testing and evaluation: +It's a living document, updated as new features and datasets become available. -- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing -- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations -- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development - -## Benchmark CLI - -This section guides you through running benchmark tests with the extensive -datasets supported on vLLM. It's a living document, updated as new features and datasets -become available. - -### Dataset Overview +## Dataset Overview